diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT, `task_type: CAUSAL_LM`) for `BunnyLlamaForCausalLM`
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Bunny-v1_1-Llama-3-8B-V (loaded from the local path `./weights/Bunny-v1_1-Llama-3-8B-V`)
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..592698da998163413a3ef51dc6ad9c7967cb4fb0
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e538c5ac298c3438a38183b05d406f1246eed4f1
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba877586bdd14eeb401b52760ab6fcbfa98f6b86828c47f86282709dce6d6d1f
+size 671150064
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e452c9fb3dfc074390e686ba821ef6aacd9d015f
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d03b5e8dc93898c8165f95efc97f379e2efafe1851803ae36808a028dac30494
+size 918507402
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e9d9a5f1cdd039c6f2c6b128ea87573e9624966
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0546417621099164,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3039,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9830943947720656,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2847,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9632868978760455,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3293,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8028815117384435,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.2719,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7516661661974123,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2206,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8414762671394682,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.2424,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.0719234020937745,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.2676,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.0134242689859672,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.1404,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8225916239244949,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.1376,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7974580027045621,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.2182,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.704284118137154,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.1572,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6777979470385621,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.1057,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6789337666737415,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 1.1602,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7721367493509489,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.2724,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7706380455996645,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.2377,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7784545094083802,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 1.1413,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.749242634067648,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.1777,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.7185554696627132,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 1.1124,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6592195155815092,
+      "learning_rate": 0.0002,
+      "loss": 1.1949,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.631844503497304,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 1.2025,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.7375870476144077,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 1.1719,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6787816889277812,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 1.2066,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.7576947108984312,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.2111,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6636555852178934,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 1.1047,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6608205468484,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 1.1325,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6580649276744608,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 1.1296,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6181390277000566,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 1.1378,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6590646065212499,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 1.1263,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.6047686546944601,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 1.1123,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6035684316870298,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 1.1814,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.6126950655591841,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 1.0809,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6234630001312299,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 1.1213,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.6623103513703723,
+      "learning_rate": 0.00019973673694024,
+      "loss": 1.1816,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6621332778326454,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 1.1123,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6071891514167586,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 1.167,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.7156277519435071,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 1.1658,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.681368373048502,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 1.1174,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.7932939060254909,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 1.0908,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.6906987465114542,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 1.1616,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.6874309548801129,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 1.1605,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.6282989911917458,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 1.1743,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6744210571411562,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 1.1307,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.7070340415480011,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 1.1741,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6457534010554816,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 1.1518,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.6965489277838409,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 1.2142,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.6654401256010378,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 1.1446,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.7663072538177734,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 1.1223,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.792057852336374,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 1.1632,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.6436782377110332,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 1.1619,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6157428135263696,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 1.0433,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6226083999846979,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 1.0906,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6801892578820706,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 1.1518,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.7088306631426905,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 1.1941,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.6635052206019092,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 1.1591,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.716943879979395,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 1.1428,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.6548821918063852,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 1.1312,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.6836426283530023,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 1.145,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6770189268629901,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 1.15,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.6351435125678934,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 1.1808,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6273220292429621,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 1.0943,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.6203846312691548,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 1.0967,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.6271331300704993,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 1.1334,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.7096674154327739,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 1.1947,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6330759609597704,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 1.114,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6100844626495261,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 1.144,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.6091114842719405,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 1.1652,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.6556844792427164,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 1.1038,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6341993533021986,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.0707,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.6380101782483953,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 1.2002,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.6361615932320083,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 1.1521,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.6700457625203912,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 1.1307,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6359055014529901,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 1.157,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.6123561405755759,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 1.1568,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.7014447167671706,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 1.1119,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6320209813099228,
+      "learning_rate": 0.000195815455670239,
+      "loss": 1.1112,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.6707111589277143,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 1.1354,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.7564097536830839,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 1.1066,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.6465610953572963,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 1.1948,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.6676598076887175,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 1.1194,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6000600146282332,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 1.0952,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.6002780664917489,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 1.1375,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.6568564993107725,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 1.1356,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.7141836382280967,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 1.1531,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6419253513780898,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 1.0616,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.6720406837651764,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 1.1541,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.6842157925259716,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 1.0981,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.6331702893953941,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 1.1176,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.6386889388907796,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 1.0773,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.6643129226720837,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 1.1026,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6344308328394509,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 1.1277,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.7128590119998348,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 1.1146,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.6603884903952707,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 1.1863,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.7718143043284744,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 1.2007,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.6785922138631665,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 1.0879,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.6223669409606347,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 1.1234,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.6916554006299716,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 1.143,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.6657548722504808,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 1.1172,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6576599676912572,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 1.1701,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.6797676267700059,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 1.157,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6298399264123691,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 1.0768,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.6724699973574635,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 1.1816,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.6453153869164902,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 1.1291,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.6535384061885604,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 1.1013,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.6911193392100621,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 1.2058,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.6411169183948418,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 1.1346,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.6796989475899463,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 1.1171,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.6508768473789491,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 1.0918,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6393050631138721,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 1.0671,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.6939986073523474,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 1.2096,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6267743426699197,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 1.1888,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.6641671795879526,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 1.127,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.660607208786974,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 1.0923,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.6625425209929494,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 1.137,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6577219507067429,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 1.1026,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.6657059122075459,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 1.1722,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.6502773168148378,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 1.0775,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.6723633523053792,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 1.1284,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.6507179596192971,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 1.0528,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 3.423947246709581,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 1.0908,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6136953582790347,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.1664,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.6626749371587253,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 1.1671,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.640247422131599,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 1.0805,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.6126873335303792,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 1.1029,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.6160471874719937,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 1.1079,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5883564595153882,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 1.0703,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.6598253644642209,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 1.109,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.6570694604926712,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 1.1638,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.6264482254749499,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 1.0336,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.717132874781554,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 1.1726,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.6423661485850787,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 1.1354,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.6747010662689331,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 1.122,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.6166480705864257,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 1.1815,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.6566104502085495,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 1.1142,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.7193281722634296,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 1.1166,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.6676076619134492,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 1.1258,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.6312704484126281,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 1.0746,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.65951286979931,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 1.1657,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.6687293279216957,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 1.1179,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.6893653003583647,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 1.1553,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6096273872213167,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 1.1289,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.6554943252484647,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 1.1123,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.667819283221279,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 1.1444,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.635370819590291,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 1.071,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.6398152170324534,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 1.113,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.628346246018907,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 1.0744,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.6743562786957189,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 1.0641,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.6667833948751095,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 1.1515,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.6415340689159599,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 1.1415,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.6333879262471349,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 1.2205,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.8344978784124626,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 1.1973,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.7222556893403116,
+      "learning_rate": 0.000177485710710289,
+      "loss": 1.1104,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.6215683011406403,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 1.1403,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.6109772882001075,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 1.1608,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.678073310646549,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 1.0891,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.6889100814206688,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 1.1428,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.6149078322374422,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 1.0604,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.6213440946326139,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 1.0861,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.6355751515769437,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 1.1096,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.6009966959973844,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 1.1754,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.6007441577098858,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 1.1204,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.6701014376613954,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 1.2303,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.6035855457967132,
+      "learning_rate": 0.000173756913120621,
+      "loss": 1.14,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.6709387489080831,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 1.0787,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.6836692135768181,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 1.0743,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.6319277254795637,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 1.1492,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.6463831638298415,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 1.192,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.7527928715648954,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 1.1113,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.6272003062313035,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 1.1244,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.6544886461846363,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 1.1129,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.615466409744761,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 1.1855,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.656712861470478,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 1.0617,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.646860440733178,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 1.1599,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.6211236590895497,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 1.154,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5861204189688349,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 1.0823,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6332202970301454,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 1.0757,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.6111843241797855,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 1.0858,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5896095181736208,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 1.0593,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.6466936019533218,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 1.1672,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.6341290170394617,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 1.1156,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.658258953640711,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 1.1042,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.6715237401598323,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 1.1207,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6574781590126316,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 1.0561,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5504766238190208,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 1.0991,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5729568943965282,
+      "learning_rate": 0.000165592860169994,
+      "loss": 1.1066,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5807258491260627,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 1.0919,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.6583040186838937,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 1.1763,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.6214088302578543,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 1.0897,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6118543760188059,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 1.0737,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.6450056173797082,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 1.1265,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6385339340821453,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 1.1025,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.6353393138176362,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 1.1319,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.5920780756404599,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 1.0875,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.6500357960815165,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 1.0852,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5786362936048618,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 1.1304,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.6209132265885975,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 1.112,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.6611614501557939,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 1.1386,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.6291152809195572,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 1.0792,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.6270903628216398,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 1.0976,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.6267030785334101,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 1.0919,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6097295463842708,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 1.1311,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.6437389287581425,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 1.1136,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.6357551055213047,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 1.089,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.5914555081464148,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 1.1383,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5787205177550582,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 1.1412,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.6841901209326238,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 1.1336,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5917724338215758,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.9994,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.6292805341854978,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 1.0956,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.6219929453367142,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 1.0919,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.638377910535444,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 1.1532,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.675790620281926,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 1.2009,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.6402219984777839,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 1.1403,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6063356071315705,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 1.137,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.5993748119119959,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 1.1048,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.5336280918399235,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 1.106,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.5683838539145512,
+      "learning_rate": 0.000152669141192587,
+      "loss": 1.115,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6201028777175007,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 1.0647,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.5765478124877643,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 1.1217,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.6346404807297925,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 1.1293,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.6298913981225904,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 1.1343,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.6041165269569185,
+      "learning_rate": 0.000150448286344864,
+      "loss": 1.0756,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.6174615585600135,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.0934,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.6205122835519024,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 1.1435,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.6117008270075854,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 1.065,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.6647522480570532,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 1.1292,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.607403578016957,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 1.1379,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.6152761080534418,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 1.1079,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.6234030351325274,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.9997,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.6565232302932521,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 1.1298,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.6326307712501449,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 1.1287,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6164374251918011,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 1.0487,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.5918756231358342,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 1.0911,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.6096400457469056,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 1.0646,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.613651480583095,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 1.0418,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.659552082278267,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 1.1091,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5793594287738426,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 1.0457,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.6144392237777093,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 1.1677,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.592217760483564,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 1.1067,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.6025771269774809,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 1.1178,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.6215619258358068,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 1.1086,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.6104099851538662,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 1.1642,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.6291772631491427,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 1.1268,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5982744552621517,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 1.1979,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5919791603379914,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 1.1137,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.641074296489744,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 1.1725,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6416429934908356,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 1.1525,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5887938842762135,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 1.135,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.5810396164082346,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 1.0408,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.6963592619330519,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 1.0093,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.6379142365447,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 1.1498,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6362022994521447,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 1.1476,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.6277834775561327,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 1.0732,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5846790082739997,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 1.131,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5602623927305204,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 1.0225,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.7355716950696458,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 1.0296,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.6184498365850783,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 1.0972,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.6040828388484982,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 1.0091,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.6231315760950605,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 1.0924,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.5728821740925255,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 1.0333,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5910177934851119,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 1.0574,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.621552668484609,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 1.0344,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.6019941844395278,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 1.0063,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.6032092128689924,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 1.0428,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.6319555566757853,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 1.1705,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5931098644745313,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 1.1681,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.608186810341348,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 1.0613,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5925589347790949,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 1.0844,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.5772865894737961,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 1.0661,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.6921431549054023,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 1.1091,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.6123370162409066,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 1.1155,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5633139577669003,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 1.1665,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.6049090840872673,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 1.1739,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.594125413834283,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 1.0667,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.5852491344979651,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 1.1152,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.6020109661965061,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 1.0995,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6231518041147731,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 1.001,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.588252719051879,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 1.0376,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.6748959311027869,
+      "learning_rate": 0.000123117632211497,
+      "loss": 1.1397,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.6253113800905918,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 1.0453,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.67648068654321,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.987,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5690792516136426,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 1.0885,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.6676421755003893,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 1.0912,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.6051116729347528,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 1.0397,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.6228985999554482,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 1.131,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5806907674242645,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 1.1041,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.6086409424125504,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 1.0695,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5664293471536089,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 1.0893,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.5875219108972874,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 1.1118,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5772945706026095,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 1.1374,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.6264590951903878,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 1.1053,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6172016436142939,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 1.0733,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.6019466045224809,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 1.1048,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.6114137676866246,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 1.0182,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5807423674819735,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 1.0819,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.6156122435693614,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 1.0373,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.6596210421275496,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 1.1059,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6508873442075375,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 1.0745,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.5774941786455217,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 1.0395,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.6078892397371244,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 1.1172,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.6314929637393192,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 1.0749,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6198863199636196,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 1.0738,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.5776271114765191,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 1.112,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.5839794395278837,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 1.0915,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.5752826792204329,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 1.0484,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5816716788845563,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.9258,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5762376254027969,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 1.1193,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5745368903277284,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 1.0653,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.6042074974846938,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 1.1138,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.593217562570839,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 1.0321,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5808461181038312,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 1.1336,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5887470860927632,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 1.1099,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.6498011175723066,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 1.1213,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.6128803546990382,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.9728,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.566857290904936,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 1.0272,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5491573102634107,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 1.0219,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5914292628639273,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 1.0494,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5757693535213086,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 1.1529,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.5917884927626396,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 1.0808,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.6172918542092238,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 1.1486,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.601600212336248,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 1.0696,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5933069876414877,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 1.1544,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.6283424647746162,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 1.0795,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.6081103271482113,
+      "learning_rate": 0.0001,
+      "loss": 1.0291,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.6162262451659752,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 1.1334,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.539529725843605,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 1.1309,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.5845019751186606,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 1.0668,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5326285077787556,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 1.1243,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.6492335865189207,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 1.0785,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.6196210544055624,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 1.0322,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.5630604678872787,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 1.0002,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5708437509002491,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 1.0903,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.6740980273712052,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 1.1051,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5766319817458229,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 1.02,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.6806069107667091,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 1.0738,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5837857855687947,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 1.059,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5840116539912913,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 1.0302,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.6173685917066777,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 1.1018,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.5485781697871722,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 1.0331,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5414180048504806,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 1.0379,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.58617739354263,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 1.1011,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.582820294272527,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 1.1239,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.5495432917639895,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 1.0539,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5941607713298789,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 1.0581,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.6704769304374639,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 1.0363,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.575579872278294,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 1.0063,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6174123619332044,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 1.0358,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.7765259631194747,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 1.0777,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.59539903150356,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 1.0031,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5838223468459752,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 1.0088,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5709288860740186,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 1.071,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.616227063456055,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 1.1457,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5891840856715459,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 1.0601,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5860896398908301,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.9879,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.5706480687216251,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 1.01,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.6075771275273073,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.9779,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.6116743292999375,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 1.0019,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.9043649158939876,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 1.0332,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.5940214479829944,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 1.074,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.6363560987658411,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 1.0826,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.572695528702038,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 1.0875,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.58278597153404,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 1.1093,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.609417586744702,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 1.044,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5518628421421953,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 1.0395,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.5602503519347858,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 1.0489,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5635320627455994,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 1.0318,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6425108193841863,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 1.0997,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5511827643775233,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 1.1024,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.7937945398354243,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 1.0329,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.6158403969166507,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 1.1045,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5466499351488862,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 1.0115,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5607938509766125,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 1.0598,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.5573994161160054,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 1.0676,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5957854875114994,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 1.0617,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.8082040631934779,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 1.0027,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.5427800958181495,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 1.0493,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.680764698748384,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 1.0509,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.6103536964718081,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 1.0078,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.6156365333216228,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 1.1132,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.571025133284332,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.9963,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5979433321264492,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 1.0295,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5706299903062161,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 1.0061,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.6090765543978603,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 1.0722,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.6495559687818583,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 1.0937,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.5392094274203981,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 1.0045,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.6620081615223117,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 1.1003,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.606655196142612,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 1.0994,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5347660961501969,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 1.078,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.5610082205027388,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 1.1287,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5929489801715777,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 1.0979,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5703922937495834,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 1.0169,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.6719211475065047,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 1.0311,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.5837439088175624,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 1.0145,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5840596663526239,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 1.0347,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.5479481912273149,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 1.0433,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5974273349562468,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 1.0157,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5920150870666145,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 1.0862,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.6040498515191982,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 1.0279,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.6890034751174975,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 1.1037,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5822406941480194,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.9946,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.5545509446925626,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 1.0459,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6478650349952937,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 1.0448,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.6232439056058718,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 1.0418,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5787774989799778,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 1.1346,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.555604140461821,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 1.0548,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.570207009965997,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 1.04,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.5830691553570289,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 1.0194,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5898401925651079,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 1.0353,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.5281642759595274,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 1.0172,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.6116846301810461,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.968,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5603052935619109,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 1.0278,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.6038247528664656,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 1.0326,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.5547217826242783,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.9975,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5307670577336794,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 1.0589,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.6247846642988907,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.9786,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.540419514977222,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.9571,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.5436204368876351,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.9559,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5634354388890752,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 1.0899,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.5373452984019687,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 1.0402,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5494376365525082,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 1.075,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.5455359002472352,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 1.0315,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5499899020824084,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.9862,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.567553536918042,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.9562,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5762050281775415,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.9938,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.587322193307677,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.0145,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5642512856058488,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 1.1282,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5827168848287619,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 1.0657,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.6157879835028639,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 1.1517,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5753811424331225,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 1.0513,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5788251938305143,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 1.0326,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.6001375240542786,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 1.1317,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5314931616369382,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 1.1151,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.6037116023113848,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 1.0136,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5529077977630005,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 1.0196,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.5322658689804729,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.986,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5832545717855004,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 1.0697,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5459373413249836,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 1.0586,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.612102133853446,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.8879,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5271492444349947,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 1.052,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.5372955655329454,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 1.0402,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.6969492636482697,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 1.036,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.6084476972576491,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 1.0911,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5707168715124262,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 1.123,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5308859435568433,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.9807,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.5531547327344877,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 1.0088,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5617715008622371,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 1.0015,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.5831603063758195,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 1.0486,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5453015306649112,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 1.1414,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5467213506991419,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 1.0077,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5838670052403234,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 1.045,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5316702863240654,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 1.009,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5538127101661766,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 1.018,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.8005565133092393,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 1.0019,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5776009430166992,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 1.0631,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.5167486285150212,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 1.0782,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.578627424855094,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 1.0474,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.5435164656497815,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.9957,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.5719424686541105,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 1.0058,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.5178410334921631,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.978,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.8263260950862493,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 1.0467,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.5362699121320619,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.9266,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5296863145242839,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 1.0413,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.6286430146632545,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 1.0208,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5947703566230248,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 1.0797,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.6150942122576878,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.9078,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5763878268103857,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 1.0606,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.604256443268453,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 1.0442,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.6540688612386386,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 1.0547,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.5888623442593799,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 1.0155,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5693574593086785,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.9869,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.5754271865799155,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 1.0866,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.6118450847137612,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 1.0135,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.6019488545581383,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.9748,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.6136203176659136,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.9842,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4957658315868677,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 1.0255,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.6032121305642305,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 1.0273,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5613220836478147,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 1.0848,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5742266347353849,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 1.0718,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5996512989753369,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 1.0252,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5513763872864543,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 1.0256,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.616897426658438,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.9867,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5321083070094431,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 1.0657,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5982846463615108,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.9823,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5510098723862623,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 1.0513,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.5312914631911628,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 1.0319,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.6058974540337424,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.9458,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5603043779884208,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 1.029,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5427880849051706,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.9807,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.5437875716098907,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 1.0841,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.7171952165375328,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 1.0336,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.554435574357804,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.9722,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5593028467456796,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 1.0349,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5593189261001621,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 1.0308,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.6010504002830267,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.9928,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.558405142401561,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 1.0552,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.6038965238199007,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 1.0493,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5878232251405476,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 1.0058,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5792843967762865,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 1.0641,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.6207354557243501,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 1.0188,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5949739846778636,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.9854,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.610828974706882,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 1.0607,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.553429692755142,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 1.0354,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.5957778158714045,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.9977,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.6417881675394035,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.9849,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5759396894351403,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.9924,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.7994891376666073,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.9704,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.5617450444953134,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 1.0302,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5355895357665624,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 1.0276,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5915038784491229,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 1.0175,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.5420413166609099,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.9283,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.6141130519430947,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.9567,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5313555292161568,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 1.0618,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.5622424268374584,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 1.049,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.5729505863510094,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 1.042,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.6147367158289911,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 1.0332,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.5744026154737112,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.9674,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5282699306125607,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 1.0449,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.6052632616547621,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.999,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.5289547451299312,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.9661,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5300300048812662,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.9651,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5263056000491023,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.9892,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5455862581344829,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.9861,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5670216658855216,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 1.0091,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5714826959991137,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 1.0016,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.6430918485169288,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.9834,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.6422037680407515,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 1.1132,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5333323799870597,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.9638,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.610339329123425,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 1.0494,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.6651824534785188,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 1.0221,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.6134178935957494,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 1.0084,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5721566730523302,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.971,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5608339377937442,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.9916,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.56075757858118,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 1.0652,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5747500010881719,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 1.0922,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.5501690957931828,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.9825,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5809208426100437,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 1.0119,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.5790357434096092,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.9896,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5941430671614292,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 1.1058,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.5963745119122649,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.9594,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5942221402502171,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 1.0372,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.6517007896733954,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 1.0051,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5914375297469927,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 1.0353,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.567730923141039,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 1.1078,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.5806300782832331,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 1.0311,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.5396734638468552,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 1.0042,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5735940043391957,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 1.0159,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5737541974889085,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 1.003,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5547697845181571,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 1.0349,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.5733392761602362,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 1.003,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5724011437733777,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.957,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.5411210088167315,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 1.0382,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6159552656415724,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 1.0812,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.6425117458362681,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.9973,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5763828913151351,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 1.0134,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.5521851129054778,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 1.0658,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5618114300494748,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 1.1078,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.5415127242628184,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 1.0307,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.9441276497123182,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 1.0195,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.5565353021993263,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.9657,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.6106580616838163,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.9943,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5634279784443351,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 1.0729,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5533138320006595,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.9734,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.6347554035913163,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.9627,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.559794125412613,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 1.0309,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.556647745583717,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 1.0235,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.6373655339621795,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 1.0544,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5425862554932976,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 1.004,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.5869350998193746,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.9371,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.5480015247701243,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 1.0562,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5140315108418996,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.9961,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.6148450538420762,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 1.0191,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5402212289023166,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 1.0148,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.5709596661807356,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 1.0594,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5118843684398965,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 1.0317,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.5527509874809367,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 1.0277,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5825398681234533,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 1.0096,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.6143820030825954,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 1.0815,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5732341075626708,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 1.0462,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.6258640782593358,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.9541,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5675367593447052,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 1.0565,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.6043933776964004,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 1.0216,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5444010836886899,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 1.0406,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.5884589478752901,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 1.0578,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5796831340346383,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 1.0931,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.5503895574333929,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 1.0234,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.6200970086722541,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 1.0565,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5514254632340541,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.9959,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.6137737755711714,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 1.0731,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.556963861842245,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.9461,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.5629250179170496,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.97,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.539571233600725,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 1.0104,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.5362621634527788,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 1.0284,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.540625025680308,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.9179,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.6002488340998073,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.9858,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.702654490983124,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 1.0847,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5493589008977616,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.99,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5746085569735714,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 1.0163,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5811417507334656,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 1.0926,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.5791459585782833,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 1.0457,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5443944524006104,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 1.0076,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.5697409783733853,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.9972,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6029098796366928,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.9167,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.5490495118546089,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.9517,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5745601087440052,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 1.0421,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.5389130677020444,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 1.0575,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5862136968699087,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.9917,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5449518186389779,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 1.0193,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5517944068434636,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.9529,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.569226164983525,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 1.0178,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.666594798511301,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 1.0474,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.5463818709913154,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 1.0943,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.582588118145432,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 1.0894,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.5784530517202845,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 1.0296,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5355976778483454,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.964,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 1.6479292551179407,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.9742,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.7998087277041235,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 1.0916,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.6483834592129597,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 1.0472,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.555368245947789,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 1.0817,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.5063818362833399,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 1.0555,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.6122762285657034,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 1.0354,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.5514939749153159,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 1.0028,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5564621643004828,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.9777,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.5797386135805992,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 1.0269,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5755447237388454,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 1.0149,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.5641293628448413,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 1.0056,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.613195023494919,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 1.0408,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5384219802334457,
+      "learning_rate": 0.0,
+      "loss": 1.0434,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 329023610060800.0,
+      "train_loss": 1.077072388458252,
+      "train_runtime": 7633.3454,
+      "train_samples_per_second": 1.31,
+      "train_steps_per_second": 0.082
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 329023610060800.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c82b6353cb3ae238b58f52414bc758819908b03
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b68d69903caea5281d9892c841d6bff1ab14fae9
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6eeb61ea8f79dbf15e1aae58c7b0170375847ec00a4f9c8ad346e2f75e5fef1c
+size 671150064
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3c91feb2111bcc3c609cb2082e1de825b66c65b
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03b75b38d4fbde04610931840aba206073356e8d0d298a4aa817d786f79f0789
+size 918507402
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..60867f89faca47db79b6fc4a4856ca5c474c0490
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.0877886563071673,
+      "learning_rate": 5e-05,
+      "loss": 1.4204,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9831963058763455,
+      "learning_rate": 0.0001,
+      "loss": 1.2175,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7478913848154679,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2755,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.1851945547125557,
+      "learning_rate": 0.0002,
+      "loss": 1.294,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.9476134964793809,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.2281,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7133074153458693,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.2197,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.7174771013040582,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.2152,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.7669579161324069,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 1.2647,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.1079413613918108,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 1.2143,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7583043070072526,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.1807,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.8929329853128891,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 1.1392,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.7522497944874978,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.1502,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6440337581737159,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 1.1695,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7692304262417425,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 1.2483,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6635724770585765,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 1.0857,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6339210471192188,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 1.1672,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.6790889547946641,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 1.088,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6692175057561588,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 1.2209,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.6437761998032046,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 1.122,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6525125502220214,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 1.1809,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.7383920037847639,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 1.2011,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.599229910744791,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 1.1337,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.6368541820149072,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 1.1691,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6479968056136417,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 1.1646,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6492295290083023,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 1.1822,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.6581163598715829,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 1.0881,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.6528735853321923,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 1.1763,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6219951075238257,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 1.0637,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.7509932918198108,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.1323,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6626945316885953,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 1.1007,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.7185209593516516,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.2184,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.639961093110283,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.1013,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.6093800026585015,
+      "learning_rate": 0.000172967916579403,
+      "loss": 1.0853,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5977982267642971,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 1.0534,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6585541370699558,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 1.1287,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.6562897452410896,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 1.1724,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.6419065162769554,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.142,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6683261695427306,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 1.1943,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.6037906950042923,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 1.2452,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6722143393952856,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 1.1435,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5849511558342113,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 1.1326,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.6496843215077648,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 1.0871,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.622273267698997,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 1.1245,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.6248801200159152,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 1.0756,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6350409040736777,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 1.1282,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6573934364037886,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 1.2089,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.6322079151415101,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 1.1702,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5931281923571887,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 1.1344,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.679086044926115,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 1.0855,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6250561524286053,
+      "learning_rate": 0.000136764169663272,
+      "loss": 1.1218,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5455053641893781,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 1.069,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.6315994964575306,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 1.1472,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.6058785708917978,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 1.0467,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.6163207183209253,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 1.0946,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6262642109854599,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 1.1487,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.6959618230470047,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 1.1448,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5894157323220334,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 1.0709,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.7981960617785065,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 1.1515,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5765858891790445,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.0964,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6046649871496205,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 1.1076,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5638841047377291,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 1.1158,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6248619311895685,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 1.158,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.6194657217776681,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 1.1525,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.6067740187850118,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 1.0888,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.5867746714643287,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 1.2144,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.6162475621164234,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 1.187,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5816260885153179,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 1.1378,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.6092121742013019,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 1.0935,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6129047955044695,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 1.0815,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5748567355209082,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 1.1137,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.5699053032110166,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 1.125,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5943499018117111,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 1.1111,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5697756101544271,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 1.0353,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5987366993178018,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 1.1787,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.599965475322347,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 1.0606,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.6231622935530702,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 1.1237,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5778865258296028,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 1.07,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.9006399353206177,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 1.0794,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.6080347772853175,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 1.1286,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6215600441401722,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 1.0333,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.5532899039833619,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 1.1528,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.5702002946777001,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 1.1547,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.6414308810709265,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 1.176,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.6023323139281526,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 1.1687,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5892600207157096,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 1.0763,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5339341330615615,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 1.0444,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.6279565355094363,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 1.0997,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.6104692154812912,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 1.1856,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.6114748193401518,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 1.1813,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5517958084848442,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 1.0973,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.5616033058541691,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 1.1052,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5734895525736495,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 1.1017,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.5849100376601271,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 1.0507,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5625815255394412,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 1.1098,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.6367740796942681,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 1.1416,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5795379896457181,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 1.1132,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.6143227738602632,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 1.0109,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.6361204646970395,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 1.0794,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5930462513939105,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 1.1248,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5733572454846041,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 1.1523,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.6495463046060299,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 1.0592,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5553303058115924,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 1.0753,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5828567133941143,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 1.1418,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5731686378110846,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 1.1059,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5654646756423977,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 1.1122,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.6216056944469172,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 1.0539,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.6030188752416026,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 1.1155,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.6116320512749103,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 1.0367,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.6241071081816516,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 1.0317,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6014326320967628,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 1.0507,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.6058356600728491,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 1.0841,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6327026531424363,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 1.1028,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5560188144128159,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 1.0278,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.683157124015512,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 1.0732,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5661517981730091,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 1.0709,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5465688334036988,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 1.0601,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.6549818213185664,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 1.1295,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.6120862014550815,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 1.0448,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5638666360794156,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 1.0509,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6363582988990896,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 1.082,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5875639830126131,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 1.1082,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5494186467411913,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 1.0868,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.64150816142503,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 1.0792,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.6834676443584711,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 1.1639,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5706584527747506,
+      "learning_rate": 0.0,
+      "loss": 1.0897,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 65356609552384.0,
+      "train_loss": 1.127972110748291,
+      "train_runtime": 1526.5357,
+      "train_samples_per_second": 1.31,
+      "train_steps_per_second": 0.082
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 65356609552384.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c5c0067886b078acc06f8b16e5e4456e15c87f4
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..df081a3bca8324aeb9f125e1dafdeeecd168a316
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c30251737ad375f44282c19e038d1ce7a52bfcfb7c9c4e7799a2e069687d5784
+size 671150064
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e31953eef7860edb89ad3d64bcc8e2ae32a3dcfe
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e626b7ff4f509eb7086f12ddce72968709b200a65e1ae2f8112e00a05200d71e
+size 918507402
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f24d41ccbb2832582567f1369d7a0956c5dfc996
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/trainer_state.json
@@ -0,0 +1,1792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.0906556698514653,
+      "learning_rate": 2.5e-05,
+      "loss": 1.4204,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9857349066149346,
+      "learning_rate": 5e-05,
+      "loss": 1.2175,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.8615599929924261,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 1.3035,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7690914837028592,
+      "learning_rate": 0.0001,
+      "loss": 1.2571,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0401337718522603,
+      "learning_rate": 0.000125,
+      "loss": 1.2383,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.9277157219721284,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.253,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.7611097693761704,
+      "learning_rate": 0.000175,
+      "loss": 1.2298,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.7467727205084197,
+      "learning_rate": 0.0002,
+      "loss": 1.2594,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.6443610988856757,
+      "learning_rate": 0.0001999915737775817,
+      "loss": 1.2015,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7890743435032062,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.1815,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.7765333577639731,
+      "learning_rate": 0.00019992417251814282,
+      "loss": 1.1301,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.7965071182224438,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.1537,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.7348769990385856,
+      "learning_rate": 0.0001997894154323911,
+      "loss": 1.1746,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7643746584209018,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.2597,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8509186289214853,
+      "learning_rate": 0.0001995873933559535,
+      "loss": 1.0828,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6590660613580283,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 1.1688,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.6908530470091816,
+      "learning_rate": 0.0001993182424657285,
+      "loss": 1.0842,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.777278529092561,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 1.2296,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.6297881039801307,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 1.1193,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.646049560969531,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.1809,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.70453563966353,
+      "learning_rate": 0.0001985793250766098,
+      "loss": 1.2028,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6216762423800736,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 1.1419,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.6748615041776291,
+      "learning_rate": 0.00019811005665931205,
+      "loss": 1.1702,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6462797823043912,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.1681,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6639668928998315,
+      "learning_rate": 0.0001975746552556772,
+      "loss": 1.1807,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.6201202500087949,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 1.0909,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.6179247844982341,
+      "learning_rate": 0.0001969734817634044,
+      "loss": 1.1761,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6490118886653817,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 1.0643,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.6590708173604165,
+      "learning_rate": 0.00019630694141514464,
+      "loss": 1.1333,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6815264154798509,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 1.1031,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.7390307701064562,
+      "learning_rate": 0.0001955754835053459,
+      "loss": 1.2164,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.6598152894051182,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 1.1077,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.6217047883076768,
+      "learning_rate": 0.0001947796010873974,
+      "loss": 1.0874,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.6260867883739055,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 1.0549,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6505612935212982,
+      "learning_rate": 0.0001939198306412775,
+      "loss": 1.1254,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.6638582395118996,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 1.1729,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.6760355472513383,
+      "learning_rate": 0.0001929967517119289,
+      "loss": 1.1392,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.689419394536968,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 1.1996,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.643046392793243,
+      "learning_rate": 0.0001920109865186052,
+      "loss": 1.2519,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6737071109245297,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 1.1533,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.6041984234865607,
+      "learning_rate": 0.00019096319953545185,
+      "loss": 1.1374,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.6340061670489825,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 1.0871,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.6792970281555901,
+      "learning_rate": 0.00018985409704360456,
+      "loss": 1.138,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.6675168072446115,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 1.0906,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6459838379601136,
+      "learning_rate": 0.00018868442665510678,
+      "loss": 1.1354,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6654945017142322,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 1.2151,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.6504567057295806,
+      "learning_rate": 0.00018745497680896722,
+      "loss": 1.1807,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.6707619740477763,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 1.1407,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6838401347017035,
+      "learning_rate": 0.0001861665762396974,
+      "loss": 1.0861,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6408245265616914,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 1.1278,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5920380892978044,
+      "learning_rate": 0.00018482009341868697,
+      "loss": 1.0847,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.6843282752275007,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 1.1665,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.6412113867201495,
+      "learning_rate": 0.00018341643596879367,
+      "loss": 1.0581,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.616003151361979,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 1.1037,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6951255358669098,
+      "learning_rate": 0.00018195655005254273,
+      "loss": 1.1601,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.6429783633007734,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 1.1597,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.6067825702732784,
+      "learning_rate": 0.00018044141973434758,
+      "loss": 1.0717,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6698768363154343,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.1689,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.6217186674790761,
+      "learning_rate": 0.00017887206631718203,
+      "loss": 1.1173,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6508142702807356,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 1.1203,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.588948225712465,
+      "learning_rate": 0.00017724954765415137,
+      "loss": 1.1332,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6632354263972068,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.176,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.7029142534588849,
+      "learning_rate": 0.00017557495743542585,
+      "loss": 1.1699,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.622116683049225,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.1055,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6151441614463127,
+      "learning_rate": 0.00017384942445101772,
+      "loss": 1.2301,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.6717386555525499,
+      "learning_rate": 0.000172967916579403,
+      "loss": 1.2032,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.6081467264082907,
+      "learning_rate": 0.00017207411182989832,
+      "loss": 1.1567,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.6204888422023926,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 1.1106,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6404949562625138,
+      "learning_rate": 0.00017025021625596853,
+      "loss": 1.0956,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6417814085898819,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 1.1356,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.635016572212586,
+      "learning_rate": 0.0001683789671614107,
+      "loss": 1.1492,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.6308134251683191,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 1.128,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 1.0838315900856792,
+      "learning_rate": 0.00016646162589796615,
+      "loss": 1.0663,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.6529443760516622,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.1941,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.6110384281678047,
+      "learning_rate": 0.00016449948488669639,
+      "loss": 1.0774,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.6384046652101805,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 1.1373,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5880440159147562,
+      "learning_rate": 0.00016249386674680184,
+      "loss": 1.0892,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.6052022720463669,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 1.0997,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.8360588585364545,
+      "learning_rate": 0.00016044612340408466,
+      "loss": 1.1554,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6064397572432122,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 1.0517,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.5937622017131812,
+      "learning_rate": 0.00015835763517965673,
+      "loss": 1.1775,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.5858555438053201,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 1.1757,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.6384214991232625,
+      "learning_rate": 0.0001562298098595078,
+      "loss": 1.202,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.6373560727609033,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 1.1889,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6221210303020743,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 1.0921,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5810311421752556,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 1.0685,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.6574941541121385,
+      "learning_rate": 0.00015186191068884775,
+      "loss": 1.1256,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.6135880637902962,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 1.2139,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.5887150663213712,
+      "learning_rate": 0.00014962478110547918,
+      "loss": 1.2012,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5720216657559922,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 1.1177,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.579631383034893,
+      "learning_rate": 0.0001473542009760343,
+      "loss": 1.1278,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.6048021015601645,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 1.1421,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.5855942323086678,
+      "learning_rate": 0.0001450517008290827,
+      "loss": 1.0698,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5992340797396072,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 1.1308,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.657035857991531,
+      "learning_rate": 0.00014271883270950073,
+      "loss": 1.1656,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5993609083935862,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 1.1342,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.594031128592101,
+      "learning_rate": 0.00014035716913228568,
+      "loss": 1.0296,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.6343533146102689,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 1.1021,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.6370282993472896,
+      "learning_rate": 0.0001379683020225714,
+      "loss": 1.1423,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5999959123873777,
+      "learning_rate": 0.000136764169663272,
+      "loss": 1.1809,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.6052322798739889,
+      "learning_rate": 0.00013555384164256048,
+      "loss": 1.0853,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.627014952912481,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 1.1041,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.6272955192024172,
+      "learning_rate": 0.00013311541550609565,
+      "loss": 1.1716,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5935607523672076,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 1.1208,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.650271337449353,
+      "learning_rate": 0.00013065466728160252,
+      "loss": 1.1299,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.6508898010705562,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 1.0757,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.6307374194277771,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 1.1246,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5874661015571683,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 1.0583,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5880110073044159,
+      "learning_rate": 0.00012567285335732633,
+      "loss": 1.059,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5927953736082483,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 1.0698,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.6174223850149873,
+      "learning_rate": 0.00012315514574583113,
+      "loss": 1.1004,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6401625589443253,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 1.1198,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.6144835895939935,
+      "learning_rate": 0.00012062182995929882,
+      "loss": 1.0495,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.6160080819037824,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 1.099,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.588716972446015,
+      "learning_rate": 0.0001180746136283638,
+      "loss": 1.0885,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5878309914107679,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 1.0793,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.6828359076557663,
+      "learning_rate": 0.00011551521375359206,
+      "loss": 1.1377,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.6583578970824269,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.0516,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.6166318556513858,
+      "learning_rate": 0.00011294535554810354,
+      "loss": 1.0817,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6893920168054586,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 1.1034,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.7653561538457616,
+      "learning_rate": 0.00011036677127465889,
+      "loss": 1.1266,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5801285599368534,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 1.0964,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.6170823725640626,
+      "learning_rate": 0.00010778119907799398,
+      "loss": 1.0843,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.7582907445192895,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 1.1703,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6260013577906284,
+      "learning_rate": 0.00010519038181318999,
+      "loss": 1.0945,
+      "step": 125
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.5494314613037083,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.7531,
+      "step": 126
+    },
+    {
+      "epoch": 1.016,
+      "grad_norm": 0.5425739308066683,
+      "learning_rate": 0.00010259606587086783,
+      "loss": 0.7805,
+      "step": 127
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.529115377582859,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.7759,
+      "step": 128
+    },
+    {
+      "epoch": 1.032,
+      "grad_norm": 0.5692792123945668,
+      "learning_rate": 0.0001,
+      "loss": 0.7704,
+      "step": 129
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.6186406508253389,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7314,
+      "step": 130
+    },
+    {
+      "epoch": 1.048,
+      "grad_norm": 0.6721041733464441,
+      "learning_rate": 9.740393412913219e-05,
+      "loss": 0.8125,
+      "step": 131
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.6948983333121643,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.7738,
+      "step": 132
+    },
+    {
+      "epoch": 1.064,
+      "grad_norm": 0.6916170510549642,
+      "learning_rate": 9.480961818681004e-05,
+      "loss": 0.782,
+      "step": 133
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.6835237897842905,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7564,
+      "step": 134
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.6864061018600497,
+      "learning_rate": 9.221880092200601e-05,
+      "loss": 0.721,
+      "step": 135
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.6676838293816894,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.7256,
+      "step": 136
+    },
+    {
+      "epoch": 1.096,
+      "grad_norm": 0.6127394586373243,
+      "learning_rate": 8.963322872534114e-05,
+      "loss": 0.6988,
+      "step": 137
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.6589588798885051,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8101,
+      "step": 138
+    },
+    {
+      "epoch": 1.112,
+      "grad_norm": 0.6338289492844081,
+      "learning_rate": 8.705464445189647e-05,
+      "loss": 0.7592,
+      "step": 139
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.6681908245671879,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7407,
+      "step": 140
+    },
+    {
+      "epoch": 1.1280000000000001,
+      "grad_norm": 0.6220519972190037,
+      "learning_rate": 8.448478624640797e-05,
+      "loss": 0.6765,
+      "step": 141
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.671093719017714,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7479,
+      "step": 142
+    },
+    {
+      "epoch": 1.144,
+      "grad_norm": 0.6252540781942465,
+      "learning_rate": 8.192538637163621e-05,
+      "loss": 0.69,
+      "step": 143
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.7025608874796816,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.7372,
+      "step": 144
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.8710246282526877,
+      "learning_rate": 7.93781700407012e-05,
+      "loss": 0.7009,
+      "step": 145
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.6957075564392686,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.7039,
+      "step": 146
+    },
+    {
+      "epoch": 1.176,
+      "grad_norm": 0.7236979642981014,
+      "learning_rate": 7.684485425416888e-05,
+      "loss": 0.7541,
+      "step": 147
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.6570340780461766,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.6962,
+      "step": 148
+    },
+    {
+      "epoch": 1.192,
+      "grad_norm": 0.608597796982595,
+      "learning_rate": 7.432714664267373e-05,
+      "loss": 0.6648,
+      "step": 149
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.6274556968757717,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.8017,
+      "step": 150
+    },
+    {
+      "epoch": 1.208,
+      "grad_norm": 0.646045678305703,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 0.687,
+      "step": 151
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.615025745148123,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.7028,
+      "step": 152
+    },
+    {
+      "epoch": 1.224,
+      "grad_norm": 0.644127006643345,
+      "learning_rate": 6.934533271839752e-05,
+      "loss": 0.6849,
+      "step": 153
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.6322032158257218,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.6858,
+      "step": 154
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.7040220453985434,
+      "learning_rate": 6.688458449390437e-05,
+      "loss": 0.7036,
+      "step": 155
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.7080898961868384,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.6814,
+      "step": 156
+    },
+    {
+      "epoch": 1.256,
+      "grad_norm": 0.6732544034749526,
+      "learning_rate": 6.444615835743955e-05,
+      "loss": 0.6868,
+      "step": 157
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.6197892347959479,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.6852,
+      "step": 158
+    },
+    {
+      "epoch": 1.272,
+      "grad_norm": 0.665156386544487,
+      "learning_rate": 6.203169797742861e-05,
+      "loss": 0.6963,
+      "step": 159
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.6814028088410238,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.7747,
+      "step": 160
+    },
+    {
+      "epoch": 1.288,
+      "grad_norm": 0.6852077562936856,
+      "learning_rate": 5.964283086771435e-05,
+      "loss": 0.6945,
+      "step": 161
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.7147333175738334,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7479,
+      "step": 162
+    },
+    {
+      "epoch": 1.304,
+      "grad_norm": 0.6688014606429251,
+      "learning_rate": 5.728116729049928e-05,
+      "loss": 0.6931,
+      "step": 163
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.6029399268866422,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.7594,
+      "step": 164
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.6405186403636638,
+      "learning_rate": 5.4948299170917325e-05,
+      "loss": 0.6861,
+      "step": 165
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.6153553215285271,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.7219,
+      "step": 166
+    },
+    {
+      "epoch": 1.336,
+      "grad_norm": 0.6138403231780742,
+      "learning_rate": 5.26457990239657e-05,
+      "loss": 0.7178,
+      "step": 167
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.594607451880698,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.6499,
+      "step": 168
+    },
+    {
+      "epoch": 1.3519999999999999,
+      "grad_norm": 0.6274961325954719,
+      "learning_rate": 5.0375218894520834e-05,
+      "loss": 0.7387,
+      "step": 169
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.6428236849437015,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.6673,
+      "step": 170
+    },
+    {
+      "epoch": 1.3679999999999999,
+      "grad_norm": 0.7424922487103246,
+      "learning_rate": 4.813808931115228e-05,
+      "loss": 0.7083,
+      "step": 171
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.6413719943869135,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7688,
+      "step": 172
+    },
+    {
+      "epoch": 1.384,
+      "grad_norm": 0.6447719371076412,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.7062,
+      "step": 173
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.6850728968609752,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.7522,
+      "step": 174
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.655257297593665,
+      "learning_rate": 4.377019014049223e-05,
+      "loss": 0.7415,
+      "step": 175
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.6649918375536736,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.7079,
+      "step": 176
+    },
+    {
+      "epoch": 1.416,
+      "grad_norm": 0.640170590584303,
+      "learning_rate": 4.164236482034327e-05,
+      "loss": 0.7246,
+      "step": 177
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.6797620642919888,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.6517,
+      "step": 178
+    },
+    {
+      "epoch": 1.432,
+      "grad_norm": 0.680554997205006,
+      "learning_rate": 3.9553876595915375e-05,
+      "loss": 0.6751,
+      "step": 179
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.6510525823177645,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7289,
+      "step": 180
+    },
+    {
+      "epoch": 1.448,
+      "grad_norm": 0.6681387989836439,
+      "learning_rate": 3.750613325319817e-05,
+      "loss": 0.6989,
+      "step": 181
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.682820922478662,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.6846,
+      "step": 182
+    },
+    {
+      "epoch": 1.464,
+      "grad_norm": 0.6669747221606931,
+      "learning_rate": 3.550051511330361e-05,
+      "loss": 0.7159,
+      "step": 183
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.6346245539679889,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.722,
+      "step": 184
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.6860528744206814,
+      "learning_rate": 3.3538374102033866e-05,
+      "loss": 0.7387,
+      "step": 185
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.6466559324864782,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.6739,
+      "step": 186
+    },
+    {
+      "epoch": 1.496,
+      "grad_norm": 0.6023284293992224,
+      "learning_rate": 3.1621032838589305e-05,
+      "loss": 0.6686,
+      "step": 187
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.6413527159496256,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.748,
+      "step": 188
+    },
+    {
+      "epoch": 1.512,
+      "grad_norm": 0.6551109522736579,
+      "learning_rate": 2.974978374403147e-05,
+      "loss": 0.6868,
+      "step": 189
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.6326824311431275,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7515,
+      "step": 190
+    },
+    {
+      "epoch": 1.528,
+      "grad_norm": 0.6385233894295355,
+      "learning_rate": 2.7925888170101665e-05,
+      "loss": 0.7054,
+      "step": 191
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 0.6662115965676328,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7029,
+      "step": 192
+    },
+    {
+      "epoch": 1.544,
+      "grad_norm": 0.6591688636277143,
+      "learning_rate": 2.6150575548982292e-05,
+      "loss": 0.6458,
+      "step": 193
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 0.6936966434138461,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7114,
+      "step": 194
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.660940431444361,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.7331,
+      "step": 195
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 0.6364962057855832,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7192,
+      "step": 196
+    },
+    {
+      "epoch": 1.576,
+      "grad_norm": 0.6560039547156613,
+      "learning_rate": 2.2750452345848682e-05,
+      "loss": 0.7001,
+      "step": 197
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 0.6585253364363486,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7239,
+      "step": 198
+    },
+    {
+      "epoch": 1.592,
+      "grad_norm": 0.624825451719301,
+      "learning_rate": 2.112793368281799e-05,
+      "loss": 0.6901,
+      "step": 199
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.6649961244159159,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.699,
+      "step": 200
+    },
+    {
+      "epoch": 1.608,
+      "grad_norm": 0.6615988192210411,
+      "learning_rate": 1.9558580265652448e-05,
+      "loss": 0.731,
+      "step": 201
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 0.669000346973526,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.7515,
+      "step": 202
+    },
+    {
+      "epoch": 1.624,
+      "grad_norm": 0.6625424275293328,
+      "learning_rate": 1.804344994745727e-05,
+      "loss": 0.7279,
+      "step": 203
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 0.6172627479461968,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.6609,
+      "step": 204
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.6968471322847645,
+      "learning_rate": 1.6583564031206357e-05,
+      "loss": 0.6419,
+      "step": 205
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 0.6646091376538787,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7074,
+      "step": 206
+    },
+    {
+      "epoch": 1.6560000000000001,
+      "grad_norm": 0.7119786340374479,
+      "learning_rate": 1.5179906581313064e-05,
+      "loss": 0.7104,
+      "step": 207
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 0.6182072495204601,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.6686,
+      "step": 208
+    },
+    {
+      "epoch": 1.6720000000000002,
+      "grad_norm": 0.6571653192714902,
+      "learning_rate": 1.3833423760302611e-05,
+      "loss": 0.6878,
+      "step": 209
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.6618007770830334,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.6461,
+      "step": 210
+    },
+    {
+      "epoch": 1.688,
+      "grad_norm": 0.6817305234712389,
+      "learning_rate": 1.2545023191032801e-05,
+      "loss": 0.6321,
+      "step": 211
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 0.6704808086973233,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7268,
+      "step": 212
+    },
+    {
+      "epoch": 1.704,
+      "grad_norm": 0.6185324770527085,
+      "learning_rate": 1.131557334489326e-05,
+      "loss": 0.7113,
+      "step": 213
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 0.6764723338003922,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.6954,
+      "step": 214
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.6529991387340686,
+      "learning_rate": 1.0145902956395447e-05,
+      "loss": 0.6599,
+      "step": 215
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 0.6470487796656074,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.6545,
+      "step": 216
+    },
+    {
+      "epoch": 1.736,
+      "grad_norm": 0.7156374080738279,
+      "learning_rate": 9.036800464548157e-06,
+      "loss": 0.7424,
+      "step": 217
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 0.6628825567627252,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.64,
+      "step": 218
+    },
+    {
+      "epoch": 1.752,
+      "grad_norm": 0.6679173882701385,
+      "learning_rate": 7.989013481394814e-06,
+      "loss": 0.7379,
+      "step": 219
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.6590707065664086,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.5963,
+      "step": 220
+    },
+    {
+      "epoch": 1.768,
+      "grad_norm": 0.6444345838715957,
+      "learning_rate": 7.003248288071118e-06,
+      "loss": 0.7151,
+      "step": 221
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 0.675515451229952,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.6899,
+      "step": 222
+    },
+    {
+      "epoch": 1.784,
+      "grad_norm": 0.7208271256227047,
+      "learning_rate": 6.08016935872251e-06,
+      "loss": 0.592,
+      "step": 223
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 0.6543039778126106,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.725,
+      "step": 224
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.6545834113447834,
+      "learning_rate": 5.22039891260262e-06,
+      "loss": 0.5796,
+      "step": 225
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 0.6413464222219671,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.6239,
+      "step": 226
+    },
+    {
+      "epoch": 1.8159999999999998,
+      "grad_norm": 0.6409389432940975,
+      "learning_rate": 4.424516494654118e-06,
+      "loss": 0.6732,
+      "step": 227
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 0.6130780753603059,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7106,
+      "step": 228
+    },
+    {
+      "epoch": 1.8319999999999999,
+      "grad_norm": 0.6436868508715743,
+      "learning_rate": 3.693058584855369e-06,
+      "loss": 0.713,
+      "step": 229
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.644610773822548,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.6644,
+      "step": 230
+    },
+    {
+      "epoch": 1.8479999999999999,
+      "grad_norm": 0.6742463101315687,
+      "learning_rate": 3.026518236595621e-06,
+      "loss": 0.7132,
+      "step": 231
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 0.623708408040731,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.708,
+      "step": 232
+    },
+    {
+      "epoch": 1.8639999999999999,
+      "grad_norm": 0.6353495695076489,
+      "learning_rate": 2.4253447443228106e-06,
+      "loss": 0.6728,
+      "step": 233
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 0.6470934000333828,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.6684,
+      "step": 234
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.6339743329664256,
+      "learning_rate": 1.8899433406879608e-06,
+      "loss": 0.7162,
+      "step": 235
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 0.6991547215955025,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.6655,
+      "step": 236
+    },
+    {
+      "epoch": 1.896,
+      "grad_norm": 0.6573698643261111,
+      "learning_rate": 1.4206749233902084e-06,
+      "loss": 0.6264,
+      "step": 237
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 0.674748855909925,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.6998,
+      "step": 238
+    },
+    {
+      "epoch": 1.912,
+      "grad_norm": 0.6717822723070102,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 0.6694,
+      "step": 239
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.6005919528342801,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.6788,
+      "step": 240
+    },
+    {
+      "epoch": 1.928,
+      "grad_norm": 0.6588728502179835,
+      "learning_rate": 6.817575342714988e-07,
+      "loss": 0.6907,
+      "step": 241
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 0.6399991839714124,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7308,
+      "step": 242
+    },
+    {
+      "epoch": 1.944,
+      "grad_norm": 0.6727098433398869,
+      "learning_rate": 4.126066440464982e-07,
+      "loss": 0.7206,
+      "step": 243
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 0.6453747666541074,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.6922,
+      "step": 244
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.6826025782964477,
+      "learning_rate": 2.1058456760891798e-07,
+      "loss": 0.6359,
+      "step": 245
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 0.6759076178003499,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7169,
+      "step": 246
+    },
+    {
+      "epoch": 1.976,
+      "grad_norm": 0.6557247806930547,
+      "learning_rate": 7.582748185719358e-08,
+      "loss": 0.662,
+      "step": 247
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 0.6905158594933963,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.639,
+      "step": 248
+    },
+    {
+      "epoch": 1.992,
+      "grad_norm": 0.6703768445324131,
+      "learning_rate": 8.426222418311814e-09,
+      "loss": 0.7734,
+      "step": 249
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.6528927891706773,
+      "learning_rate": 0.0,
+      "loss": 0.6719,
+      "step": 250
+    },
+    {
+      "epoch": 2.0,
+      "step": 250,
+      "total_flos": 130603858001920.0,
+      "train_loss": 0.9226167418956757,
+      "train_runtime": 3038.6804,
+      "train_samples_per_second": 1.316,
+      "train_steps_per_second": 0.082
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 130603858001920.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec7b01dfe7159d02d5d955d904fbc05f76c827ed
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..23df97ed34c2c1ca2c7159df525227430495f255
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e94296acb549b80024e300c9a323b71ee3077e46a71b5a7f17b0f989114a6e5
+size 671150064
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b3922dea144ee16eb1ca5d1074aff938ef4e53d
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd0a8733639c356cd85e403d0c948297f862e3bc719bcb1be355d5aa0fb94d57
+size 918507402
diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f13aa3a9346a36954edbf3382959979c913e948
--- /dev/null
+++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8934607401720872,
+      "learning_rate": 2e-05,
+      "loss": 1.3726,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9702764908035759,
+      "learning_rate": 4e-05,
+      "loss": 1.3514,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8928671122403777,
+      "learning_rate": 6e-05,
+      "loss": 1.3464,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.733085596985154,
+      "learning_rate": 8e-05,
+      "loss": 1.2532,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8570494872495142,
+      "learning_rate": 0.0001,
+      "loss": 1.1028,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.0400527883080488,
+      "learning_rate": 0.00012,
+      "loss": 1.2828,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8428812825744051,
+      "learning_rate": 0.00014,
+      "loss": 1.2222,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7939623442508198,
+      "learning_rate": 0.00016,
+      "loss": 1.2832,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6664596962036828,
+      "learning_rate": 0.00018,
+      "loss": 1.2148,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7655774280979413,
+      "learning_rate": 0.0002,
+      "loss": 1.2593,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.9002584907773491,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 1.2274,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6973696058498797,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.1777,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.7961500621219396,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 1.2384,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6934442208162789,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 1.1529,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7760808007197953,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 1.1961,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.7662958744276087,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 1.2431,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.7492875679562371,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 1.1853,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6644857894549516,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 1.0896,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.7028291596766432,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 1.1517,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.7151346253989167,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 1.2471,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6597569217376018,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 1.1051,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.7876128410548371,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 1.1344,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.6788178829714944,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 1.1579,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6959667855216832,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 1.0935,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6360283110594259,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 1.0861,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.7149481186935083,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 1.2315,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.7290306737510642,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 1.1464,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.6106941074968636,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 1.165,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6171080416329583,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 1.1248,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6951443407479725,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 1.1997,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.7110530520599219,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 1.0452,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6720320155074625,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 1.1767,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.6247290543001629,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 1.1555,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6768489593434474,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 1.2408,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5850076189172989,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 1.1552,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6272578992853955,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 1.1843,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.6041727331165989,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 1.111,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5853996034242491,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 1.2139,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.6044406014017014,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 1.1251,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.599638159597199,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 1.1915,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.6230215019985648,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 1.1465,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6685570571892,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 1.1318,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.6591831305025555,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 1.1549,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.6302659422826664,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 1.1936,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6551385711236554,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 1.1421,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.6846744761544914,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 1.1853,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5995584235958642,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 1.1172,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.6693072383155553,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 1.0899,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5823860976342665,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 1.0778,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6548607686911667,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 1.1292,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.6455229915546238,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 1.2137,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.6797448562922372,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 1.1747,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.6169579402663269,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 1.1282,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5752462685930675,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 1.2357,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6524878494461898,
+      "learning_rate": 0.000189241899082916,
+      "loss": 1.1178,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5803880758055884,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 1.0582,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6284049891526231,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 1.1503,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.6429933521925236,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 1.1227,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.6642315948614433,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 1.1491,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6318270344993646,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 1.0924,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.6736013814160521,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 1.1376,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.6290487314099203,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 1.1099,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5868150325072425,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 1.0572,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.6287361571659641,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 1.1199,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.635478784306169,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 1.1375,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.6934699927576387,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 1.12,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6300333641647747,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 1.1237,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.603806858704276,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 1.1252,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.6328663939650221,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 1.1643,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6587939462250295,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 1.159,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.6554856961720463,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 1.0889,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.7008545104931168,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 1.1307,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.6109951811360471,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 1.1416,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5813378104951092,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 1.0637,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5909823532691317,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 1.1533,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5928182395354943,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 1.1322,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.6569884577681215,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 1.1574,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.6278707831640676,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 1.087,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.640992980278925,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 1.1207,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.668359710460776,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 1.143,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.6285654698383347,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 1.1844,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.641766909272507,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 1.1131,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.6393224242059528,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 1.1277,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.6079741134716099,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 1.139,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5889968838496492,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 1.1896,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.635708382670474,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 1.1767,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.6347494253926971,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 1.0919,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.6081714242856907,
+      "learning_rate": 0.00016884803286362,
+      "loss": 1.002,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.6687524144949784,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 1.1217,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.6367497368719929,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 1.1734,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6924335068178338,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 1.1005,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.6407334042780027,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 1.1806,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.6831640002730881,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 1.1643,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6818497163627971,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 1.1479,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6023164784286884,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 1.0199,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.6071162038895305,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 1.1268,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.6621975947969588,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 1.1054,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.6430349596453502,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 1.185,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.596716232396584,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 1.1167,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.8030431208767496,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 1.0043,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.6322736824639741,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 1.1156,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.9683388132336448,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 1.0974,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5821355886938816,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 1.1224,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.6319431789381217,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 1.163,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.6020869145562134,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 1.1574,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.581552295038906,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 1.0393,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.6434562480401083,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 1.1169,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6505539891186463,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 1.1125,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.6143461053823742,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 1.1756,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5678824592957399,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 1.1602,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.7196907483605643,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 1.1461,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.60872627866381,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 1.0779,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.6088948095827803,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 1.1465,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.6001452511808347,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 1.0806,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.7237162644452426,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 1.1681,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.6361290323952742,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 1.1262,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6042846124009682,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 1.094,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.6026735908383196,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 1.031,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5813635120693075,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 1.0934,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.6607505994262415,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 1.1583,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5797413168500228,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 1.119,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.6212414913681146,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 1.1696,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5874185714602815,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 1.098,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.6030500726178164,
+      "learning_rate": 0.000137546377942393,
+      "loss": 1.0709,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5515883025449129,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 1.0873,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.6617058119593494,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 1.0748,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.6044340009807571,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 1.1109,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.6280824205414822,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 1.1219,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.6023589276763569,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 1.0416,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.601503782550989,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 1.1202,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5947810021728166,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 1.0489,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5960541379516832,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 1.0828,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.6301040459235665,
+      "learning_rate": 0.000128717230790931,
+      "loss": 1.1456,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.6218790112688984,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 1.1443,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.6331790582677411,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 1.1512,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6920164094640866,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 1.0705,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.6457611049577008,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 1.108,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.5851148684445352,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 1.1249,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.5616442882431073,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 1.173,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.6029147306217549,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 1.0922,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.5983185759853441,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 1.0569,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.6459305861705162,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 1.1201,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.6413251599078064,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 1.0949,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5567459691512033,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 1.1077,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6050063276858223,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 1.0871,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5768000953188527,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 1.0336,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.58354353504979,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 1.109,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6239680842671043,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 1.1406,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.5639619023664048,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 1.1355,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.5994597322729686,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.9949,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.6187766287679105,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 1.123,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5880933746187075,
+      "learning_rate": 0.000109348690758,
+      "loss": 1.0811,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5214334484876151,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 1.04,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.6045346113069798,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 1.1112,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5972215634171124,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 1.1067,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5862603509263139,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 1.0297,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.589645718660567,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 1.0336,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5966492770502331,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 1.0866,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.638493351015436,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 1.0765,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.6208601412709848,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 1.1046,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.7456812775232595,
+      "learning_rate": 0.0001,
+      "loss": 0.9999,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.6175846317153577,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 1.0644,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6714016144368657,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 1.0637,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5837880916678994,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 1.1408,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5530213168075825,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 1.1138,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.6270056310312441,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 1.2315,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5561192764658761,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 1.0632,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.6073371369069578,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 1.11,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5944081760932085,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 1.048,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5848610084940267,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 1.1179,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5971797062090918,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 1.0489,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.6027497233577513,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 1.1686,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.5842512985502563,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 1.1134,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5790768255854168,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 1.0993,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6244985783771545,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 1.1102,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.6363066079679679,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 1.1054,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5758171908775908,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 1.1669,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.5340947294270786,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 1.1309,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5831665436537086,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 1.1248,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5921556048685298,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 1.112,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5744251424245944,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 1.022,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5554185323336767,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 1.0337,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5743450858728842,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 1.0235,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.5602361322464704,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 1.1312,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5954682352280739,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 1.1366,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.6109690367869244,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 1.0795,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.564705324519998,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 1.0559,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5998310026704221,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 1.0471,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.5923598358823705,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 1.0805,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5837508570039001,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 1.0486,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5993515803964782,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 1.0758,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.5640431095353253,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 1.1195,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6597258202025034,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 1.1206,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5700867646836549,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 1.028,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5438950021648811,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 1.0901,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5895697124383904,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 1.003,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5531669590622527,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.9802,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5634663273360929,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 1.0221,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5852062381389797,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 1.16,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.5937242795441198,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 1.0475,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5555566589467784,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 1.0439,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.585204914236333,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 1.0635,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.574624666649916,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 1.1461,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5895005838586477,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 1.0533,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.568531621396431,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 1.0117,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5713855868879502,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 1.0869,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.533463283372751,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 1.0669,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.596191800876185,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 1.0889,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.6033094935134793,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 1.0037,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.547758452785834,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 1.0326,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5674015427230111,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 1.0642,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5564959453988821,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 1.1004,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5510907797932991,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 1.0383,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5364931725607687,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 1.0421,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5494121878521853,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 1.0529,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5363726515242464,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 1.0332,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5465442810140333,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 1.0314,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.5739488758444642,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 1.0601,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.5879132221028294,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 1.0442,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.6045995523657305,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 1.0801,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5617927988946498,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 1.0536,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.7006136765663606,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 1.1709,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5519226190976758,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 1.0035,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5722678337036814,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 1.0321,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.546356073899012,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 1.063,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5572189973946577,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 1.0493,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5609326202853951,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 1.0401,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.5573693239077621,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 1.0633,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.6120271145151069,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 1.0247,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5556112139031892,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 1.0004,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5442180530300604,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 1.0231,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5642744910174119,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 1.0598,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5301873588771624,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 1.0609,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5270404145127983,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 1.1029,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5792631918257901,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 1.0148,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.5870979627317701,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 1.0717,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5785722600525429,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 1.0672,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5709981460824844,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 1.1185,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.6217871274009109,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 1.0541,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5502163111083991,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 1.037,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.6939068470167971,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.9026,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.5348805571120492,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 1.0644,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5851088907445847,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 1.0135,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5593559807902603,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 1.0758,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5203188462905168,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 1.0303,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5605154105634976,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 1.0211,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5664827723584672,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 1.0348,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5900057315119912,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 1.0552,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5381277090688744,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 1.0548,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5710250024280842,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 1.0525,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5247596985036673,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 1.0817,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.5154307520162806,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 1.0125,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.6337614103760363,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 1.0949,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4957665878310061,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 1.0451,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5308197423642265,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 1.0212,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.6283452572668032,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 1.0286,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.5277570848309738,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 1.0517,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.572380991818581,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 1.052,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5459522732962143,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 1.0256,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5818244377960141,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 1.0306,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5757150906330832,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 1.0161,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5612123216943989,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.9974,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.5361229678609986,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 1.054,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5826837681269577,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 1.0603,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5402287917678977,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 1.0676,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5155809923846763,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 1.1004,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.6178311815438159,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 1.0745,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5804058648418592,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 1.0794,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.6096364520049162,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 1.008,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5638671810187824,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 1.059,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.551134037205924,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 1.0661,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5476576939761192,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 1.0599,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5618097858382183,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 1.128,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5545956623834618,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 1.0403,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5585106863409853,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 1.0839,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5777158619246178,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 1.1582,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5312338291560674,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 1.1125,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.6019089227782868,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 1.0226,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.5385907521451545,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 1.0428,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5929711954637248,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 1.0584,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5352372798399714,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 1.0398,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.533249413457343,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.9853,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.5325098845257755,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 1.1217,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.544616418598338,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 1.0499,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5389805914927016,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 1.0955,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5992479750486914,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 1.0344,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5563032838951335,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 1.1259,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5344650297317594,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 1.1222,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.6026066762936244,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 1.048,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5380734238182887,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 1.0735,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.539269851195273,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 1.0271,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5101331087182588,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 1.0877,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5260284887574873,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 1.1117,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.6165360335006401,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 1.1343,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.518362085552097,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 1.0402,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.569371845816852,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 1.0454,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5281615447976709,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 1.1065,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.6341864106370607,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 1.0237,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5471729782472513,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 1.1344,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.521375171737115,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 1.0338,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.565578997311788,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 1.0667,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5650391103593894,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 1.0766,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5820265479098684,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 1.0305,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5926416552467474,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.9986,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5397571968572117,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.986,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5508600370370091,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 1.1161,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5460513088356049,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 1.0266,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5604567327919951,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.9579,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.5383144642509582,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 1.0491,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5254302419934068,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 1.0394,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5885982889343618,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 1.0553,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.6364946352079829,
+      "learning_rate": 0.0,
+      "loss": 1.0507,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 163148489719808.0,
+      "train_loss": 1.1001117495008004,
+      "train_runtime": 3775.7928,
+      "train_samples_per_second": 1.324,
+      "train_steps_per_second": 0.083
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 163148489719808.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9547b55d0b677450ba90e79425a03c5e60fa7cde
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..41f4ce71f4849e3338f2f68dc7ef2e76fcf7fce3
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c1ca96a72a93c29ee37109b9231a208a153b02842d8988648783c09ec6b80cf
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..116dc51c700b5ebf12e85de8035d6d578066c895
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7942b3ff9fdd57de7917cc18505f63252b815f5b36cbf3fe2a9df7d46706f464
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..781b8168ef9f50ae8debcd64dd418c2d0a4c94ea
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.229373798278603,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5995,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.3255378585572952,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.613,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.1225274609868587,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.5345,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9601904040026625,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.4265,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9802642339973807,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2977,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.068848268073717,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.2638,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.0871580440341346,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1175,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9068610495921913,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.0853,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9811047071816428,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9156,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8447891281203188,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9022,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.7461951662582579,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9529,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6177951220259487,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8863,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5348190521442691,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8834,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5413077228307152,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8928,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.537046673330067,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8687,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5002112492477047,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.811,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5366964322274403,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8333,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5014078956773536,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8687,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5252222393318335,
+      "learning_rate": 0.0002,
+      "loss": 0.8638,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5373932752091392,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9097,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4762690764333146,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8766,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4568017154233179,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7836,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5001790554030369,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8819,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4686071769810681,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8376,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.3951476828707915,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7475,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4204532563793381,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8023,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.43605750796653514,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7777,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4312291066927483,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8084,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4422682479152776,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8133,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4877596727466243,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.7581,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.42662048000259273,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.7395,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.47553941708746517,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8115,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4373444242596017,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.7637,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.46828037568677117,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8175,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.43057672078604997,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7972,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4162655730684327,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7981,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.40732518436942894,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7856,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.40951222619704847,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7895,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4291197421264731,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7863,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4401239448447159,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7659,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.39828178872140096,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7714,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4107568482589156,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7668,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4113529823060761,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7933,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.42928009327160627,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7457,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.44243052895313983,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7841,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4063400800613472,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7806,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4205339635927461,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7937,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3994926519851495,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7852,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.42462911990010227,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8006,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4000315027275353,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7605,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.39076661860395445,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7824,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.41280970200617884,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7927,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.436430398186534,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7576,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.46076369353240937,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7212,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4040674327666037,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7347,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4101966638057085,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7761,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.42413278233874757,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7647,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.42302978595377466,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7519,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.40006395047526183,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7763,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.41306878173353634,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.786,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4032602892912269,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7688,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3892433009854969,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7553,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4004143313998048,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7814,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4183210940841329,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7779,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.42247226472149213,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8057,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4195024410862326,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7776,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.40403657134063525,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7967,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.38704775081201337,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7602,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4023423479962683,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7905,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.38501121315792136,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7468,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4091701530186358,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7996,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.36355727696105633,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7519,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.47848552592213617,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7495,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.39357553915086346,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7767,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.3957171032861638,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7751,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3891948568622778,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.6961,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.3920291011646516,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8146,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.36170578672528475,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7712,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.3848155321538925,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7489,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.39347713045374183,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7946,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.3821383752069123,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7531,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3667234454853835,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8051,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.3932102128704723,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7757,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.38616189039631826,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7179,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4001982588795188,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7891,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.3891513932660926,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.769,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.42116323271475387,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8356,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.37430938000545133,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7141,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.3986562727168721,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7183,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3968462925532793,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7722,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.3936868026019858,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7625,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3953996953277331,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.736,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.40725778287562764,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7957,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3956730097996287,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7145,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.3856174409454427,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7535,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.397956094443797,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7729,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.36657234780092485,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7173,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.40634460482101714,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7646,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.366949505480132,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7345,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3809635372006216,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7884,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.43363867156407665,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8288,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3720900596896555,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.6943,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.7634072094055883,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.6921,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3816278455124801,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7996,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4097928192635754,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7669,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.41740583671897846,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7738,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.39027372813301175,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7115,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3964970065919041,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7536,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4118659445973478,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7963,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.37631494230013696,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7276,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.38303900808802116,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7686,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4617088675955906,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7056,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.4007475923943109,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7646,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.42668599104118365,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7578,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.48137065952465374,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.6984,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.37822476605881156,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.6791,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.3901709397441698,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7179,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.37130511642749203,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.6907,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.402217274699389,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7473,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4188277958180444,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7372,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.38853389068801125,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.6853,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4169792734028954,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7119,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.38595621654016343,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.6993,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3900268823974102,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7333,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.36289349941433785,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7228,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3704974542399799,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7049,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.38860747392520834,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8067,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.35710911077277013,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7114,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.3593712961722801,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7564,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.39059526056982474,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.6863,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.3649404379752295,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7655,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.41483604598870905,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7418,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.40122957922475827,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7252,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.38125659969112374,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7394,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.39193173095160444,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7334,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3852015284724536,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7274,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.3853228321924374,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7309,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3859003592587561,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7459,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.3955167140395012,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7377,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3849858979613698,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7148,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.3827161665380854,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7162,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.38274975285166835,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7363,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4511367752683735,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7714,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3929031484028961,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7105,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3909829530272507,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.76,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4005882700079233,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7263,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4242225472595463,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7437,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.38146844799520946,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7018,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.41123976073579627,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7277,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3645878348641131,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.6988,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.3716361961263643,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7289,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3863591358843344,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.6796,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3755277313037591,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7121,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3772428656770563,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7327,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3863558598033965,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7667,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3700798736664158,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7297,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3625692215921915,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.6769,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.39900910764233755,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.73,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.35824773081382993,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6944,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.36142203894300484,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.678,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.3975821452535094,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7197,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3936341912600741,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7609,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.364201706851904,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7399,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.37875567954037376,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7793,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.360667351795953,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6545,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3902385173718295,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7456,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.39398866659150356,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.735,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3743528690845882,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.778,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.39915123353703374,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6924,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.38414665513613727,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7476,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3812020107036411,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7083,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.38019590830017097,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7413,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.3916000729224414,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7144,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.39835029355072454,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7183,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.39104548258367283,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6871,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.38502788895335127,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7586,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.42484728564668284,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7384,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.39800478681057694,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.6594,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.3972473284811008,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7076,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3836320462207601,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7248,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3773777910389116,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.6984,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4507746764915864,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.772,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.46866078541687584,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7244,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4422461097872862,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.697,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4121388436172653,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7225,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.43545976521929347,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7725,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.3804683222868557,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6659,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.41487982913028726,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7619,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.38962666625247283,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.6597,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4228344639001,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.707,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.42015839182108944,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7653,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.37683110692755134,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7252,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.3772024054374649,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.6891,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4195457931474166,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7618,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3720664397945769,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.6919,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3740925748103595,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.6873,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.37428951513062436,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6701,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.38121318450586267,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6679,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.38597591689559313,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6978,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.39397945622113667,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7049,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.38176167402393113,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7353,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3670506697529049,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7342,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.3892830499788043,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7281,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.36666548505760843,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7122,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3824410040805799,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7042,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.36673613420740625,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6759,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3757664044110948,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7035,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.39532812560975455,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7232,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.40438099505972785,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7594,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3419577876429092,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6602,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.36246576871660047,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7293,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3588255354877175,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7594,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3901441159572394,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7491,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.35719437308085233,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.6476,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.364700995492638,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7183,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3784135273383183,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7033,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.3749000315840051,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7007,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.38800053532610096,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7138,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3418098446663238,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6944,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.38111268593685843,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6762,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3710500845514271,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6591,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.37441467488393976,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7269,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.39654706299414744,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7242,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3568180401154739,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.6911,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.39034288279985846,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7557,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.34557279768441856,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7082,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.35705484794926023,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7241,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.35798628411050953,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6318,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.36136379085752796,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.6821,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.37618932987528225,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7047,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.38816240044304784,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6785,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.38762381736859697,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7498,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.37991677053794676,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7418,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.35899697440611383,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7145,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3855068948533226,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8137,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3640340840565266,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.687,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3813513523794655,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7193,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3884503295790077,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7276,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3591527170010043,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6579,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.34340461207851425,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6988,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3716142230619113,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6837,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3746943540983614,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7138,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.6149858253919029,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6912,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3694845102751698,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.6709,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.36790318218261714,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6941,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.38696101358451285,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7065,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.3626054966593918,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6974,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.36810130417217946,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7059,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3624097418077036,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7229,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.34599108827247227,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7048,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3879058306480498,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7924,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34023836681670694,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7157,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3711063338350221,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7521,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.36472198648465,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6572,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.3576711587325934,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.6568,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.37710702613417896,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7111,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3793009735708133,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7324,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3728235770012968,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7258,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.41141037106287126,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7106,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3686623259505453,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6813,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.3520653468975216,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6652,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.37518104968705895,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7043,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3663522013676943,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.695,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.350137374488117,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7243,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3600436082512767,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7378,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.368533068906102,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7028,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.36198729400227725,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6707,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.357610238818022,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6861,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.35665899603087337,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6905,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3595715382711654,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7023,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3339131480094967,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6807,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.35554457901474196,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6922,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3461905884208761,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7075,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.35885451438850346,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7204,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3787323699405888,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6887,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.34410493352577043,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6943,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.35672201920563706,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6677,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.35647172367608165,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6617,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.34872758802754317,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6522,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3488974339691185,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7242,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.35533726973414664,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7126,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3665197508720917,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7104,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3901613865642856,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7022,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3680636194883015,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6206,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.36205230764978635,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6846,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3700102223107961,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6834,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3707852248575377,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.73,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3328396952074698,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6326,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3647275618298019,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6903,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.37522903402817953,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7148,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.3459570022247,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6901,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3453472634165083,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6541,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.46858167069260703,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6474,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3425009900081892,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6787,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3408371432957332,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6421,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.34309203175466485,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6605,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3366472364937924,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6766,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3675673823200964,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.716,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3624885760394247,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6831,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.34485269464743384,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6894,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.36363641721437323,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7037,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3804302820151281,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7362,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.34617907037291995,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6751,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38976971548297357,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6566,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.35106778892349916,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6725,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3514557744429845,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.656,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3546450466677947,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6563,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.34723925989473264,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6996,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3285706810054756,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6848,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.339283492277106,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6901,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.33349436629549434,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6477,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.366636642059819,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6929,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.34361329665130813,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6612,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.33892463526483174,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6741,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.34973460117012567,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.601,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3565085399782329,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.6666,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3418140611458426,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6594,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.37195336389240935,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6845,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.35110530974078485,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6592,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.36489174363833043,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6779,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4171223297688789,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6283,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.37413927100291494,
+      "learning_rate": 0.0001,
+      "loss": 0.6831,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3528735973381612,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6644,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3745336489378991,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6503,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.37275093455288594,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6984,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3366978224997515,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.667,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3564685656104209,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6582,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.35941780232957743,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6493,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.36411996751195624,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6512,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.35889993632067707,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6817,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3413373464186135,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6057,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3371052874072895,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6546,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.34015871442448126,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7548,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.34768381176226404,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6249,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.35055208678570654,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6609,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3696407680595258,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7256,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3443399204481053,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6692,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.36476030873795856,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6694,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3554648289200159,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6894,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3434710885105751,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7328,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3503197379585682,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6687,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.34374188690838525,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6637,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.34634204748549424,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6231,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3448828690040321,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6761,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.32290933438854713,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.619,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.34523451534574523,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6732,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3396007317968865,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6939,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3400300609137072,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6334,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.38617639543364063,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.71,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3449820516474646,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6525,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3736688125800989,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6632,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.33245275428669596,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6842,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.350824463531681,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6891,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3769679328839366,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6448,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3950956861477466,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7028,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3769715248178555,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7524,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3550864279693727,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6956,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.45579727504565914,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6863,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.33662953960341585,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7046,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.34022215506401343,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6133,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.32991494219957185,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6001,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.37097329614451063,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6345,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.3515701273892897,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6209,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3390859701302974,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6725,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3261614696438565,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6474,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.35231998430218436,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6356,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.34487852889553167,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6755,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3597076481941233,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7348,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3535979886913036,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6586,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.34056561461249657,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6566,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3313840825869037,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6265,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.356858938714993,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6765,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.36345182693995554,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6972,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3600146037302109,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6791,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.34190431713398445,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6731,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3970175003007231,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.5951,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.34878533658916844,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6761,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.35184500997088136,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6261,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3524872693988646,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6783,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3349970649879647,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.64,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.34561197102116836,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6634,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.34109755923127466,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6392,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.33306188510296875,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6143,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3236879938329974,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6788,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.34056717019759014,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6898,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4159978517020688,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.696,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.3364273904078727,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6492,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.34813484667820843,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6728,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3573723704774501,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6136,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.34399138714064675,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6621,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3847135119237165,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6525,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.32113437688191543,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6312,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3241685066678636,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6244,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3311499374935771,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6085,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.33061122626387485,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.65,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3446348528234487,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.643,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3573472270681029,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.703,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.35255691408742473,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.679,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3603893140466124,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6553,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.38622469068407206,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6372,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.36843820195201377,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6775,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.33775042010116374,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6271,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3503067124724184,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6354,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.36004712918213044,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6682,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.35939821897442975,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6332,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3618911935669541,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6593,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.33854288006772076,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6748,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.35168033639419904,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6682,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.3299805820327915,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6552,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.33869510493371086,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.69,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.34602955410892206,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6222,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3516653306140801,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6362,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3477412232753374,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7016,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3436682577532841,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6582,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.39319811292136175,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6778,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3209059243373577,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.5812,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3306336586741872,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6614,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.33213736652811277,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6687,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.42351662500126236,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6479,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3459766680283071,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7437,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.3389725535499289,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6837,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3325393445610534,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6647,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3386095660421953,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6233,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3370241727957124,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6342,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3535099702320059,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6553,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3740969238615925,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6769,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.33691852823168594,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6591,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3625396658837189,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6234,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3625049876543691,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7037,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.35212752090111005,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6319,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3625382805725208,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6591,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3456657296675217,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6479,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.36142618618576305,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.672,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.36438136146189665,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7145,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3476979241199332,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6801,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3490731264105313,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6975,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.35502946241086925,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6424,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.333526847372875,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6618,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.33168275158650823,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6287,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3612765677677876,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7079,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3524554525972209,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6378,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.369226952481241,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6805,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3452349944201978,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6882,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.34405536883787835,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6434,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.33682862695340515,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6393,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.32084200913186556,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6707,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3230728754454013,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.57,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.33415857941196947,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6381,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3325144894494768,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5948,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3384324416507604,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6118,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.35864162638588704,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6658,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.35109161421023133,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6789,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3503657334157009,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6615,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.34176881292945643,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.559,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3695571222094076,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6665,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.35129314234943027,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6937,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3232157480221569,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6354,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4373189259156228,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6597,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3536064139293133,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7012,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.35182109246364857,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7255,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3357727040183112,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6427,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.31551559137728413,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.5961,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3431897137939103,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.632,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.35622036759042697,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6459,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.423500818863888,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6537,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.32263410267189285,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6263,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3493717459234465,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6189,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.36040957182808014,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6126,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.34005175160167944,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6298,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3353710359851425,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6603,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.35480473559934433,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6568,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3249120021096706,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.5979,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.35427464454731056,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.5887,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3410934853706257,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6249,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3441258786282159,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6573,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3403111049153056,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6313,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3118307375313091,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.5768,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.37470127605134,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.686,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3392666553842881,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6376,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.33244221956218917,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6558,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.3381098234243685,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6015,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3414803363545892,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6172,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.3497723124625655,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6539,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.35103151983517905,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6257,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.34521209582923357,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6416,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.338129214111799,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6554,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3586447069828409,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6875,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3799063058634966,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.657,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3525580555427139,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7051,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.34351459499403025,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.67,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3505246349587142,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.617,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3166680583654929,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.633,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.34214341151187166,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6526,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.35997181508279635,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6656,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3401845314118498,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6755,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.342946579399248,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6585,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3250252010083524,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6016,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3310191288755577,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6444,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.35459213436813997,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6499,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.35371328706023036,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6763,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.36401419583261213,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6153,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3486439267338421,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6552,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3188377292964103,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.631,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3422508848615651,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6525,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.35044866925244333,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6661,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3281478236957656,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6337,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3465012025306824,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6288,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.35684858536917496,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.651,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3329223140475106,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6393,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.34814608355260684,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6419,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3684288530902952,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6602,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3234120709849068,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6599,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.33327145252980284,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6328,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.34282110962358386,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6125,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3520446592520151,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6465,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3377156357740252,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6605,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3293023248874468,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.5864,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.33559194831506434,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6268,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3481052104875377,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6174,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.34302263073518136,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.686,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3546926275970501,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6928,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.40019803032777135,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7419,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3797909265050745,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7091,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.33675971345197864,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6188,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3308856148788113,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6564,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3297676712198887,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6339,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3297539594308802,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6047,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.32656069639037466,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.65,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3357612955625864,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6497,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3134749601763479,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6415,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.33255251237148076,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6157,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.35401141781585654,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6918,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.339156042969735,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6424,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3597947315285747,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.683,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.35056167421065826,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6145,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3204723751052671,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6259,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.32690588734103426,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6326,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.37394346783213966,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6205,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3851867117156391,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6652,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3570088341709913,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.652,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.3395319081531777,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6831,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3624681070079515,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6538,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.33344608217878446,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6288,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.35815663037601725,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6464,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.36000721164107724,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6759,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.335487015706093,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6464,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3187332361299941,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6336,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.34460908171875926,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6424,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3423571997875007,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.657,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3473702898460535,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6475,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3392579613996751,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6181,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3339561694301029,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6335,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.35297172484859773,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6123,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3743646934730517,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6763,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.36032553764175485,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.5929,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.36040494901162906,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6309,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.33090877985851547,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6208,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3374944266272634,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6329,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.32948701734625285,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6219,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.35525609402854275,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6802,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.31854471346311747,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6242,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3483920540606934,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6565,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.3428486661429979,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6697,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3460855723552184,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6557,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.35322794593241996,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6526,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3332322804597539,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6287,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.33293602440066133,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6188,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3559051583590942,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6721,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.36086522563672596,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6489,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3741741572241904,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7021,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3605126758683723,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6041,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.33587874752703967,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.631,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.34686099816372346,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6715,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3516087889717228,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6498,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.335259263859088,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6494,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.33264363464980434,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6259,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.3698507544225705,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6454,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3359232567315633,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5946,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3443174796511564,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6722,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3344774015741628,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6335,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.33067467572081993,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6316,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3445448332902342,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6502,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3401402738516373,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6342,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.33063080662094113,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6423,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3299746965273354,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6291,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.34690133907895937,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6474,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.34108334966002213,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6407,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.34903671948512743,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6188,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.36401320838784496,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6513,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.33777563351469914,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6524,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3330583430716245,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.5809,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3498181757801496,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6417,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.34818940049890484,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6884,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3558170390104319,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6722,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.35004208109334956,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6559,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.599282402226537,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6081,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.34569801459946947,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6661,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3498002867783941,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6973,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3291910091032132,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6124,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.34044113093668404,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6075,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.34323574405735774,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6366,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3502311037032876,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6547,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3730856320600065,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6321,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.36199246020923154,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5857,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.353380416231102,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6817,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.326954512921091,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6183,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.35207403639398227,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6973,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3509588481398452,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6838,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3351419559207814,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6273,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3463415954355351,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.625,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.43454340967775285,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5852,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.33830860580191335,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6344,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3330808465283843,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6068,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3399037356615535,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6457,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3733580315038423,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6802,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.34045622189058755,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6386,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.36430301422773936,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6331,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.37114813941906755,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6566,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.34414233428813773,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6393,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.35187345171016765,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.631,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3274001090589902,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.586,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.34181628859730867,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6695,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.32708688446053996,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.5799,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3473133775182582,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6162,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.34794400742509624,
+      "learning_rate": 0.0,
+      "loss": 0.6736,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 553635297230848.0,
+      "train_loss": 0.7008882174491883,
+      "train_runtime": 9922.821,
+      "train_samples_per_second": 1.008,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 553635297230848.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7967aa0ef727f20681a3395643e02bdd696cb8dc
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d35db844c598e91ddf65fb51ff585e542df8fea9
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26afb8ecb361aa1573c588e61fa343d711716fbcb5c105e69dd7c57f09063e32
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ed1f31cc0342017c763b31b33f447ff58af33c54
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aff4131c4b93cdf643c1be464f218e02ad4cf80233999ed855e74ca85ce81f13
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..36d3856ae417315c93e3b6ead96fce1bf8a77ede
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.174852192286667,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.5412,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.2419123477662346,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5452,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.3059811183984338,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.5984,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.095206797753618,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.51,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.975387169235589,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.4603,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8843993721042885,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3699,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 1.0351394073465081,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2938,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.0020984757485565,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1656,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 1.1739296273942252,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0653,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8903900013638182,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9677,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.9449069404528453,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0258,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8932747533831232,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9059,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8591901163930206,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 0.916,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.760837120573045,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9499,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.6333378357605206,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.8212,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5351536277597265,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9195,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.5815984102342769,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9489,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5323214203271259,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8855,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.5271911019078436,
+      "learning_rate": 0.0001,
+      "loss": 0.8142,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5064885518974941,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9364,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.5045995751188714,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.8528,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.4838134325203139,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8881,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.4842193651138658,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.872,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5193281225894892,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8481,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5067542173200873,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.781,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.47295936190501514,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8296,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5492365639189041,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8508,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.48167656965140465,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8196,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.4534346171238379,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.7851,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4561415536028462,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8299,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.466156430217683,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8252,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4511530570176387,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8315,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.4670234354410688,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8015,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.42188075569209216,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8158,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.43018547127689205,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8703,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4167689351885316,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8155,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.4392867612695143,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.779,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.42312975726573043,
+      "learning_rate": 0.0002,
+      "loss": 0.8139,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.41603337108686156,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8569,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4701428337634891,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.7531,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.4169677817719275,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.7875,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.423915091323005,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8048,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.41304493098960177,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7742,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4414232950779636,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7888,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.4394849114380294,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.7749,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.43304201360654093,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8211,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.4147928959811065,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.7894,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5265522462227644,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8382,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.4484829312606615,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.7796,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.42783348522362324,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7772,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.42289209845467834,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.7503,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.45852588594005317,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.847,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.4404745544017252,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.7578,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4163731261657778,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7804,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.419024020190218,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.7907,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.41029265779192,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7869,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.4148072984657268,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.7516,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4419314618936028,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7754,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4245687861270358,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8123,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.413959613692514,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8094,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.4107368970313985,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.7805,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.40851121731308915,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.7457,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.4230745779297569,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8077,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3986868445295381,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7979,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.41312211726046255,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8196,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.41224771104768804,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4235560132752872,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.734,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4220942153235023,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7894,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.42418197692776344,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.7513,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4353473129092401,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7628,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.4070142671565011,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.7845,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.374428315728352,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.6753,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.41260021209545916,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.8008,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4240938972210413,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8001,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.4739076786256837,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.8006,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4274506824751054,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8105,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.4330605147869637,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.757,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.41833833467992704,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7513,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.43108864976438627,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.7463,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3970898529137441,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7933,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.40354930148793755,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.7691,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.3996396037171106,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7541,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.416669802914507,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.7207,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4360323522039162,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7964,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.4258524108241521,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8468,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.46468479366071264,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7766,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.3926015424233146,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.7032,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.44477391483356604,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.766,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.422480524852823,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.7825,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.45110875067089323,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7853,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.4060988959276219,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.7873,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.41718451058971323,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8195,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.3893544036303535,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.7577,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.3977928617341849,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7278,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.4009331041880101,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8254,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4483565727456726,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7527,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.40841215504099687,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.7083,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.39252603106546685,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.732,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.4025898246464496,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.7381,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.38995013220861396,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7239,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.36744758611250783,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.6995,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.3993807376912834,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7204,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.3852929627024007,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.7149,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.37795734304575523,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.6888,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.4056272767803145,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.7184,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.43095451759319586,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7887,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.3990843159715983,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.6993,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5488510128773152,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7816,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.4061949940285652,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.6864,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4108354732903804,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7521,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.4105779612703396,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.7676,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4090672628646142,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7462,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.3889721654833078,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.7223,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.40864142586658264,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7601,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.3861095237897938,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.7642,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.40820672730260193,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7227,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.48574711337691706,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.7971,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4116425695467719,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7327,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.4603067786671372,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.7477,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.40760458191649707,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7227,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.38622296406754275,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7838,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4311793566422138,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7686,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.4001412773975259,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.7193,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.40234722016688085,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7245,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.40717966576780684,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.7707,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.41601902466245766,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7693,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.4618987997508002,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.7965,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4003184333259143,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.6668,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.544704002941789,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7099,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.40018693660363797,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7251,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.373424152652994,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.6499,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.37934430650594364,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.667,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.42005136783693797,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.7462,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.388614133162938,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7261,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.39517853074521264,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.7397,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.41084214200497754,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7633,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.37553037057766164,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.7362,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.36072262854433307,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.6731,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.39675875599537863,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.7552,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.38275996674752444,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7351,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.40281565207472164,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.6884,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.3800398937956588,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7649,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.37818955959756634,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.7439,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.38940134231115464,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7644,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.37094580145774664,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.6911,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.3843275373661643,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7342,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.37820878684303777,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7212,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3762651664537602,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7271,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.3906758138124606,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.7539,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.3950841107355177,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7658,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.41386345666241126,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.7738,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3944209165179543,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.6852,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.40443701216227507,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.764,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4053836691434719,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7958,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.38968536544727367,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.6706,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4221878091531115,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7524,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.37726503599066874,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.7492,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.37329107131334766,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7132,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.40330351890304966,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7367,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.38379928388751416,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7165,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.3754901811558422,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.763,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4102122892411287,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7964,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.40045070158740387,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.7444,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3904879379908833,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7361,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.3901576587422018,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7447,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.42698701509255066,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7129,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.40973290680547114,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.7629,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.42713400703510906,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7244,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.40000493907484663,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.7246,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.407066195754339,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7265,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.41535699810035354,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7627,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.3624646836729746,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7513,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.39284941439854026,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7494,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.3884196538662921,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.678,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.37265254992120467,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.6806,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4275943898238247,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7078,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4065266567259306,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7465,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.3990288724103298,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.6653,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.3940172504770751,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7101,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3788687571876394,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7772,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.4312036206527422,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7452,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.43294716581488774,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7133,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.3759136819153871,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.6569,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.43297690007372713,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7342,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.40682691868312687,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.7374,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.39341849343313623,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7191,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.4009073956965113,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.7521,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3909729045474597,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7695,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.4761266766104909,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.7076,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.36601341885699573,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7155,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.3737945505162673,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.6854,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.40564425939183085,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.6718,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.41992512451788194,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7427,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.3797669077452734,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7074,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.3821815634843998,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7322,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.39209848345860876,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7087,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.3673075795060958,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.6942,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.40762391950517585,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7172,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.47168046809017583,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7269,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3673248032142168,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.6921,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.40793413927631295,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7065,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.39571471829755545,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7011,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.47734958206069844,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7222,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3959188333021286,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.728,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.41699901529985056,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.6943,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.3735553986044611,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7062,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.41784251839692316,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.7848,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.38059219219418583,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7059,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.3624543403043051,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.735,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.8872087505840575,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7237,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.3781844792195068,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.781,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3618561686453649,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.6432,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.3771379156643068,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.7163,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.3784585866949077,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.6731,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.3662423072781949,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.6866,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.38826740426565903,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7774,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.3932066536646525,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.7286,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.36997603460335343,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.6523,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.3918279402399443,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7047,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.39326362429181955,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.75,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.43585572564626895,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7153,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.393579939630381,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7493,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.37447374558504265,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.6896,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3915094461951344,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.6808,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.3773715484147197,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.7599,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3994812555017749,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7287,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.36481961860846446,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.6859,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3670019666608475,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.6722,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.3903432748130376,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7106,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.39401899871180873,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.701,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.3753278029930899,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.6691,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.40961898544625003,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7085,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.4002125511109632,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.7739,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.37394434731730075,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7413,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.3675891639914627,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.711,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3721632182237883,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7038,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.3842121632076663,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.7093,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.39132717878114504,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7617,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.40541838904954997,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7113,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.40580848314374224,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7234,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.3788393820883769,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7089,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.40589206564731684,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7425,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.38617912332436866,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7176,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3895654787912917,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7395,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.3775653943994879,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7397,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.3969607038613283,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7164,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.37801038885076754,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.6721,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.409233055247135,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7397,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.3846321370323925,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.6769,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3836594402578531,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.73,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.3698135875805448,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.7178,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3800110014298568,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.729,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.3824003452606007,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.6834,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3819552032844194,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7049,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.38025311113168603,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.6839,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4074890859906348,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.3869044169593943,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.719,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.3997831414626095,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7497,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.3798744672159815,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.7397,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4315916270774472,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7412,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.3481358218167205,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.6752,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.36423243940453054,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.6756,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.3932085351565385,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.7467,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3669609029970178,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7016,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.38172853932274353,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.687,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.391962458393,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.6936,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.38628359351249747,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7466,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3919425169241573,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.6973,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.37985313895545575,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.6682,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.41308411411507867,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7446,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.36923000783816107,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.6944,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.38912255752294606,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6824,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.37516157859785415,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.6835,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.36077449717955606,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.6689,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.36762261756990633,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.7054,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3661342722722111,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.715,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.38269955506386627,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7714,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.3688490506054277,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7426,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.3873509578960858,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.7516,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3761794380044784,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7079,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.40056581928299334,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.695,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.396306174344962,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7512,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.36911161649464735,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.7422,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.37875624220379006,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7027,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.38206572357858626,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.7182,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.44087069157672043,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.712,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.39564240353912855,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7011,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.37477423593554254,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7007,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.38532593342694543,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.6883,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.36152163663724846,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.6788,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.4099338561938528,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7382,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.36593122698903263,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.6662,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.3718410704563449,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.6723,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.38015563138293135,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.6681,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.370824964904487,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.6981,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.37190425522333226,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.6915,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.4134502690104631,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.6933,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.3958332078622072,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7158,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.3877104741157981,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.6979,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3801598621787254,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.6658,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.37172870557646764,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.7234,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.3869693539717847,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7424,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.39380608987801785,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7212,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3836380194679643,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7015,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.38294391568201475,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.6728,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3684407971034021,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7276,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.36782914865474786,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.6935,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3547005180706965,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6801,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.37358925011534083,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.7104,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3691174540695535,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.6772,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.3810745954314981,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7494,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.37714165675070244,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7016,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.3566572512605275,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.664,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.37756734385677954,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7196,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.38629987639451213,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.669,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.36129876085632506,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.6862,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.36290271691213977,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.6881,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.36828837884499055,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6885,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.3947123381892617,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.6739,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3710342070647339,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.6682,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.3568277558353811,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.6023,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.40096883734102273,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.6938,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.39366913430025,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.7283,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3899719501341565,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.6953,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.3811060149312132,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.6923,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.3877184901266767,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7181,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.37570487164894645,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.6903,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3990768334325875,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7045,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.36468017035100203,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7439,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3709488792010069,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6447,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.3855312196505767,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.6968,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.37081730330401114,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7141,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.39710496855717275,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.7059,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4175724196477424,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.6963,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.3842215052177171,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.7422,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.34944285190348157,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7044,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.36351526740228046,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.699,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.36884647531457104,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6755,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.35857535515859895,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.6806,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3923666857697795,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.6936,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.38849877071620453,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.7046,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3584263264067099,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6597,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.36138589740234983,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.6663,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.38527180068415556,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.6825,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.40412844856312724,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.6563,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.37401398327720353,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.6698,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.40515318586757254,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.7499,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.35146950749093353,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.647,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.4063808336460343,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7205,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.38644229466527624,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7077,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.3420681264350092,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.6289,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.38308061028907253,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6857,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.3674045490160201,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7407,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.36874718203802986,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7133,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.373818222715427,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.6957,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3762865907231924,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7396,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.36300277570336514,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.6652,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.3904917705539183,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.684,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.37199677823710336,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.6951,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3985541089788225,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.747,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.3747357313142051,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.6941,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.37858108431847404,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7125,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.35862584195531766,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.6779,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3429005769754215,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.6994,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.37876750251492375,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.6804,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.3468960179430786,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.6676,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.3831747796081473,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.6808,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.37841376785544,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.6752,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.4279316248084326,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.7185,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.37601604528298943,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6702,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.3505774673496094,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.6946,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3664877863818731,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6836,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.38420617090256015,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.6758,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.36007075289624113,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6639,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.35828089436110433,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.6958,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3771595610691394,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7302,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.35494018975711594,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.6509,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.3557291271239453,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.6866,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.36194769111631925,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.6794,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.36763967338423365,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.6702,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.34966585380789356,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7094,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.3514193489828989,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.6858,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.39152681449741006,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7274,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.35755785523271066,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.6865,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.3614439044420442,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7082,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.358590942979874,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7045,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.39159248919640544,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.7603,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3712266427733067,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.6877,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.3864394812212985,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.7502,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.37697745012714656,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.6384,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.38798423687896516,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7353,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3595948778744978,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7087,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.37010313959677527,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7016,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.35207553195728936,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6507,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.3415757077970016,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7226,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.36246577422721343,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6889,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.3874584927068638,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.692,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.35050336327947873,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6844,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.379138324147785,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.6694,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.35029682023932,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6303,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.36710781427868844,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.6583,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.3869140830738328,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.6907,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.375016132656621,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.752,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.34846735999551737,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.6605,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.36763862476695747,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.6912,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.36866573941258934,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.6729,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.35520848274255595,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.6989,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3465772132574022,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6339,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.3725783884243975,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.7124,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.37677205954801835,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7082,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.3781493059820502,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7119,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3758004699076751,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6938,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.3614628036787356,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6454,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3803283009276044,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7137,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.356995580870094,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.6862,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.37156355892036974,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7002,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.39892124229473647,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.6629,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.35627794947594865,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.6471,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3729559884121915,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.6675,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3898626072425355,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7193,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.3873226189997151,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7032,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.3626940426569876,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6807,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.6811790858918634,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.6627,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.36326657900745496,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7167,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.3437354521778118,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.6352,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3528095739021646,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6894,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.33823450009582723,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.664,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.33825292026949455,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.6398,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.35227441495536316,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.6893,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3491554261726527,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6672,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.3557268832514482,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.6671,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3606105281950049,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6855,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.3592704005306545,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.6681,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.3526739493236099,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.6997,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.3392310064970902,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.6656,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.33948021180080706,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.681,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.3567394688296086,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.65,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3446383475635426,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6621,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.3652029111338738,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.6742,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3453361682114829,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6584,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.3470509010770301,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.6401,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.39355894619844295,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.709,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.3505865283735694,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6681,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.36355352325231827,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.6804,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.3706176316524146,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.6431,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3572688796259224,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6244,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.36607041853928607,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.725,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.36028601904932567,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.6983,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.3521866298600251,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.731,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3525632875009428,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6906,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.349145508761633,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.6661,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3692327518479248,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6706,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.3394289327028457,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.6582,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.38785992118789714,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7117,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.35986869847962144,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7225,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3548413026743914,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6911,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.3879131571635838,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.726,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3836466458319732,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.668,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.35093440345017185,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.6224,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.37107833192612777,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.6344,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.35404640899323836,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.6652,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3617031028209322,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6717,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.35520372468151784,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7378,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3593723764750375,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.674,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.36271685238720835,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7051,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.38835589406468907,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6903,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.3450666933540303,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.6484,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.37833508715281694,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6871,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.3664373247024666,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.6563,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3756177451926007,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7033,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.3590059343277386,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.6728,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3670433190964126,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.724,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.3787190599554793,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.6205,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3454497682512466,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6565,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.34704461844006534,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.6776,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3803751866284716,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6574,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.3576266744878013,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.6592,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3522343869759249,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6707,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.35211080690880675,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.6496,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3947310834775007,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7246,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.40152009995972676,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.659,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3749105234224021,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7107,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.35963198305871635,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.6456,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.36004407590129617,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6849,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.34256206446359866,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.6883,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.3553528334119219,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6635,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.3697495673554272,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7297,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.36124751312813597,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.6949,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.3938585615178389,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.6753,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.34933636975948074,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6783,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.3401386033764282,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.646,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3276439157499567,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.5926,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.3648292475835564,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.6481,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.35712190056947224,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6694,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.337986123079369,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7206,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3636824004852608,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.6833,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.34534566926240323,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.6617,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3683676689863593,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6952,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.3754604071614481,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6904,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3502870151094157,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7077,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.3532161453414018,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.6867,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.34216842169703193,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6958,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.3452175475894138,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.6905,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3960344690681105,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.6733,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.3521931246522434,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.6811,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4052583162933025,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.6317,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.3560063804754956,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6463,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3667689282680826,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6769,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.38219062338204474,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.6563,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.351086490437197,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.649,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.3626946330009153,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.6553,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3555684128972453,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6511,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.35713740915573505,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.6488,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.36969172915692416,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6076,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.35557894666896,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.6782,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3770997160342275,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6463,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.34799230460154973,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.6537,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.34417676553483495,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.6101,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.36437771839881833,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.6535,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.35862352295980277,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6756,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.3699165408543771,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7167,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.34655546549114363,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6545,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.37727583792959807,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.6446,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3546180541237899,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.6821,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.3471439526432889,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7216,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3421929753393047,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.634,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.3568553020749221,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.6224,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3469058900844622,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6822,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.365114132455481,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.6733,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.36117504088003655,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6676,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.35183717817106164,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.6772,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.33967313753393985,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7049,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.3756110009159534,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.6906,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3567481356987659,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6477,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.34587374165715784,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.7119,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3547315822290638,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6496,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.35068723175307986,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.6594,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.34204285531613576,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6592,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.3472665172059884,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.676,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.33023294700263917,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.5958,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.3549980428582043,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.6441,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.37089522030188665,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7005,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.3835797613220501,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.6497,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3560284521440167,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6315,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.36053243792495826,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.6169,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.36357053520621047,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6751,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.383585475808702,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.6828,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3763923557862577,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.692,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.3583680537989247,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6529,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.357971945414798,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6242,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.3675974232417326,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.6667,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3753348705839958,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6567,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.35296107724075343,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.6744,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.35737068044442893,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7119,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.3742031428545619,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.6701,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.35912721431704026,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6314,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.3546955760845967,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.6564,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.34330325859647054,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7046,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.3842046843569431,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7085,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3550618456638281,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6484,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.3508035800123553,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.6555,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3424921420125426,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6466,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.32771932502111106,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.6371,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.36329217258802543,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6793,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.33510791033413506,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.6321,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3511162120492735,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.686,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.3747552930644442,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.6156,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.36780445309988385,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6229,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.40200107534286134,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.6955,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.37411746459882256,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6482,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.34333223943737057,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.6667,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3605601968276506,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6582,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.3705587684406122,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.716,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.34145327558000876,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6518,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.3428159808012652,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.6287,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3735476142133528,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6166,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.36088630739120264,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.6409,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.336805561573356,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6051,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.3564740780513103,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.693,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.36842908620913045,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6431,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.3452755158126447,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.6389,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.357204753971374,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6614,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.3555927630992336,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.6506,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.35071779715989215,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6773,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.3468587325939666,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.6899,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.36870305727725394,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6622,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.3563200011548181,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.7039,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.35243606935919075,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.601,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.3889294072436819,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.673,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.33968815595714036,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6692,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.3442811253045003,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.6491,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.35931484952336434,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6255,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.33267277569085585,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.6222,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3795423975740203,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6546,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.36130273787387834,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.6542,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.33368631687639777,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.663,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.356016044191631,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.6029,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.33437157623080216,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6009,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.36929028277298886,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.6923,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.48648035436777626,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6236,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.37050028429307086,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7212,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3717500208488645,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6674,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.37039844873150024,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7207,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3211782696393604,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6373,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.3259793164203849,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.618,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3360116751173385,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6427,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.3334069590638098,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.6427,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.35653982289760483,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6511,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.3356635843682699,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.6108,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.33475695126644883,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6468,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.364022566933514,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.644,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3411126482433266,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6793,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.34598234849443915,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.679,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.33115184965176603,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6081,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.34689895068402693,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6728,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.33426322053595414,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6263,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.3908589045319285,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.6811,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.37086890844371845,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6791,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.352481399418626,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.6035,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3556905909409883,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.655,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.323278647915081,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.6104,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3543930190777003,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6571,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.3768691858208267,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.6995,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.35385766440035066,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6742,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.3371564161334034,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.6972,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.33872615023532215,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6888,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.3619087379833558,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6515,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3354498087205006,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6546,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.36092006804188176,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.6529,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.37397378593728353,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6434,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.34307293730008304,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6401,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3418056541944195,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.633,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.3856471386919663,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.6639,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3622342192250039,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6282,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.33339053534505564,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.6412,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3381423744397256,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6223,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.34388080644268326,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6431,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3417840426086896,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6437,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.35310851098361445,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.6559,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3434450564374458,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6134,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.3718283430838848,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.6588,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.32338874943023094,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6285,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.33231452773322584,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.5941,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3513172583604685,
+      "learning_rate": 0.0001,
+      "loss": 0.6744,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.3460857789690904,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.6523,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3602769991596556,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6092,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.37105715522506105,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.7126,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.36634203067739235,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6457,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.3488670432393145,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6647,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3680897825669399,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6412,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.5043876017186315,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.6728,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3610104478993515,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6492,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.34717150933690816,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.6456,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.332565817334399,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.634,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.35876030069682097,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.6696,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3448074874893681,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6233,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.33930716718963655,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.6798,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.35969300240814994,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7062,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.3640424890367901,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.6479,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3465541176904211,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6768,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.336377209847485,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.597,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.35720032678174196,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6569,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.36147187421948807,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.6572,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36456812813376643,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6269,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.34238113344853044,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.6486,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3538460544846853,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.5996,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.3381284231312074,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.6357,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.35986582214349305,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6764,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.323563052308323,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.5963,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3956312370915418,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6905,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.3397007639140885,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.6332,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.32247734926631433,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6166,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.33475964650166706,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.6024,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.35529200114794895,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6809,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.40786172554139966,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.7048,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3468015636323706,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6035,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.35249258081975404,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6213,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4021936975423688,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6314,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.4553860959058501,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.6215,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3790640863147156,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6616,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.3708524896435191,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.6457,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3475291898433243,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6591,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.3797371340902428,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.7092,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.36574583109343267,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6362,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.34289876837011185,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.6396,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.338568685704138,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6506,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.3235712908871543,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.6439,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.33715700433179935,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6221,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.33400788687652666,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.615,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4266829104465876,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6346,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.3430245374866473,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.6141,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.36185558574035265,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6729,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.37420051018977957,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.635,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.33276192449027164,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6213,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.3759493424663521,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.6793,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.35066054482886627,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6812,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.8357492304681667,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.6549,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.35288247489819874,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6189,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.3328609388255515,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.632,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3410948239423367,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6334,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.3753989457758862,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.637,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.359743509245254,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7027,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.32967241090714533,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.6337,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3148349462575111,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.5859,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.34726818294504275,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6949,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3502795284328535,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6685,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.3573854083478653,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.6171,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3251675669449455,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.5611,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.34313457651696666,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.5764,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.34556748889259625,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6561,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.3275366659802456,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.6266,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3357722741486548,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6924,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.33507644767103867,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.6281,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3696365488761149,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6706,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.34590966873605616,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.6089,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3472035646814677,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.705,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.350747720690762,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.6231,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.33590050967476426,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6588,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.349347905388431,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.6549,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.33807568727676013,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6686,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.3370565633589211,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.5958,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4735862669632568,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.668,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.33837996550694993,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.6308,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3392732986146249,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6761,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.3358765639072988,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.6178,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.33173102492725864,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6019,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.35658256031807223,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6705,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3748455544598971,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.615,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.32165526070762285,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.6348,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3368828998645065,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6455,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.3397827712490149,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6607,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.36276971696566507,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6662,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.3435139781227591,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6271,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.35148211886992414,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6304,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.34153053729256644,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.6428,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.33771166105394973,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6694,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.3381109664449864,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.6126,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3322619750857264,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.63,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.3411227947303166,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6276,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3311599693102653,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6116,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.3618162118125905,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.675,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3448886900635197,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.5876,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.3815086820828657,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.7153,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3424972965832173,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6105,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.3438098859462676,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.6447,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.33902666734712233,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6163,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.3314464176008185,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.6111,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3492886708213938,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6544,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.3431716850924754,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.611,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3662298831016498,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6386,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.34431760904769776,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.5973,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3330871036206359,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6317,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.33922286635680166,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.6581,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3714215187715549,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6471,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.3315286563306679,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.5955,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.35981870840392294,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6558,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.34551284889004336,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6483,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3423659385547448,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6457,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.3664967859517354,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6665,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3335290747742488,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.5645,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.3494881138806042,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6391,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.33656974163543923,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6303,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.34539266418049264,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6368,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.34374647515567053,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.623,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.36021041560458406,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.6643,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.3476822052034766,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6675,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.3396315149281206,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.5738,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3372436009028793,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6379,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.3328355003941503,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.6326,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.35791987496306515,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.654,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.34334518968094335,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.5783,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3368095032667133,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.673,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.3537145192474511,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6625,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.3531996916980002,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.5923,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.3612731220254524,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6194,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3886557385929825,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6137,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.3544485173162249,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6692,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5096621614352511,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6197,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.3509486989832222,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.6686,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3572483650291914,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6273,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.34366028857416675,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.6459,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3510230058828406,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6559,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.3495066634770411,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.6165,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.33332936474310854,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.5952,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.328832402223803,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.661,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.33607601799098924,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6495,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3423937706972676,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6274,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.38289286460997496,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6738,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.3338479271655618,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6613,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.328643411646018,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6375,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.35086440716031086,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6083,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.336518880389029,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6488,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.3426882198313159,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.621,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.345716170858981,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6358,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.36706291182679185,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.6447,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3387527465960079,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6243,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.342881537155457,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.6221,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.338695743727742,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.5956,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.3601693441226773,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6193,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.33716618376042284,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.5791,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.32128132942983445,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.5719,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3749884944695895,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.5817,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.33564233650998176,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.6281,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3424352082166732,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6543,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.3509018285412802,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.619,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.35934568187673543,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.5966,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.3431163978046936,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6364,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.34625798672579355,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6361,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.32443600753641594,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6356,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.34459251472351493,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6048,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.3345412259745322,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.6438,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3260165546056915,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6431,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.3330174029964389,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.5977,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.33501579672416454,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.5632,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.35228382275397585,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6751,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.35943790003209114,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6123,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.33798390186355776,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6397,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.33046298698438736,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6365,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.32574852800773374,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.6033,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3216753676493232,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6385,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3299670892424923,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6213,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.3125119039605451,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.5912,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.3797141126460133,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6551,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3370210754898945,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6043,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.3478736493741818,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6374,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3595153093595184,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6186,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.36047353803967713,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.6221,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.33675550469388493,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6202,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.34797239104232763,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.6529,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.33924394710023,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6134,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.3828287262102965,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.6054,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.38173112009274013,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6432,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.3331270379892713,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6187,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3386203032989208,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6411,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.3240975883763448,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.5982,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3199483145072002,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6465,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.3459437390242868,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6086,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.34244123635720497,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6422,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.34609204729722776,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.6048,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.36228965918385303,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6071,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.331923451965058,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.5907,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.37386608339506866,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.647,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.3290981600797574,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.5752,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.356801104340884,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6042,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.32682734025038185,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.6136,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.34255695950760445,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6837,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.356159265019211,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.5955,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.35671005269805617,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5766,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.35753987551620403,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6295,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.34154223402161743,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.599,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.34068669433515647,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.6481,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3163105027104073,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.5894,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.3728940924971478,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.6313,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.3323470866292376,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6051,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.3422294361915459,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.6409,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3581679930388663,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6063,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.33177641577322436,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.5722,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3341678200125972,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6048,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.34913269384188267,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.6038,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.34210468444628106,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6287,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.31210895546892026,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.5884,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.35219449590912993,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.5874,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.3349621825934785,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.6063,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3252780110735412,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6325,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.33810475867529366,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6134,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.35824268924366476,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6086,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.35718567477435437,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6394,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3506010415131454,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.624,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.35778490743392094,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6845,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.33433788575913326,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5847,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.3466744954167082,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.6432,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3489421862068939,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.5872,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.33117493924801256,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6442,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.3348837449633641,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.5787,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.3339803491928183,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6456,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.326373852545579,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.5956,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.386365828267358,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.648,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.3399141661816139,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.5771,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.3599625251464077,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.5987,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3289470636524019,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.5911,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.3229463772774697,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.5997,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3829602596620701,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6462,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.33426423483585777,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6504,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3536087449960244,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6745,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.329713415930323,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6567,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.339402553173935,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6348,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.3537464292423974,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.609,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.34196305631334595,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6142,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.3440748200033215,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6214,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3384041540200196,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6364,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.3532800876899815,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.5858,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.33383750428349107,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6363,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.326654564926912,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6071,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.37644944556356774,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.691,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.3505956435013439,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.5953,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.33658269804132707,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5956,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.3366854732590486,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.5764,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3466033210511593,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6589,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.38021202329505166,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.6101,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3347747636682198,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.596,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.34764008560357174,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6236,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.34635126520329873,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5961,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.45852499489645104,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.6135,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.47300754187876,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6055,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.3336196301153384,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.5842,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3856310871637318,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6631,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.346311090865471,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6542,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3347814958414131,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6127,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.3327824426233396,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.6036,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3284600973420504,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6202,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.36923160165027574,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6601,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3527301872492391,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6607,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.3425200973681726,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6242,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3568470629966751,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6436,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.34147814071727495,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.6151,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.328333809265017,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.5981,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.3382395567060543,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.645,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.33331968460229827,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6053,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.34057869855137035,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.5948,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3290546163013816,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.5952,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.3359249985513373,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6254,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3251069976287337,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.5989,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.3514172715827471,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6454,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3243574493629817,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.5848,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.32423564691983164,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.5984,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3349320085842291,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.5493,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.35507378645526905,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.5782,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.31346565845057295,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.5792,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.339190510820282,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.6312,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.341907055391667,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6016,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.3255898643152782,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.5567,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.34848022717459426,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6055,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.34249415602960825,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.626,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.31351750195913475,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.5773,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.3261853813612115,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.5937,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3550362507161625,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6176,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.3347554401991842,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.6118,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.35567840871262907,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6469,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.3559769207425711,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.6279,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3622260889023372,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.5957,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.34546558486966733,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6383,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3414376189445506,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6204,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.34334137334535764,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.5722,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3357756940991449,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6246,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.33452702182387545,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.622,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.35678047137193136,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.5947,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.34522705825177974,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6169,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.328482844180868,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6065,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.352773092897859,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.5957,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3430681903679473,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6013,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.3509634075522109,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6274,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3643192826914098,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.666,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.34819866786226594,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6293,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.33836509096864215,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6602,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.3377560200675623,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.6469,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.34701822295393986,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6237,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.3496507449517045,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.5974,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3518870482920689,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5863,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.3386060344373831,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6339,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.33056597036296287,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.5947,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.35315143510157854,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.669,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.36678405493630256,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6602,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.35496756095058585,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6109,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.32484178473000924,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5899,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.33623176711375374,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.5498,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.33047697672464993,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.5651,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.3379805406898622,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6536,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.34348726637901394,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.5979,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.3369936237622732,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6209,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3620126243026674,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.5937,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.3478287872629664,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6635,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3610022987451473,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6256,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.3510684858259938,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6034,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.33229601293270916,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.5716,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3407544768084185,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.5545,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.36019711156371026,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6634,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.3496451511939708,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6065,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3651858232496374,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6387,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.35783748090888917,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.5921,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.32933507445357985,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6366,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.34552065441434493,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.5765,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3414784912136309,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.5805,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.32676818861295753,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.6157,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.311466870667826,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.5735,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.3288806224466473,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.5358,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.33842663500282605,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5862,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.33076301508735606,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6268,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.34191118796049813,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6238,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.30470855878489794,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.5607,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3394770076542117,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6047,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.34752911910840706,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.5989,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3171861041373292,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5986,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.32370046708611994,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.6322,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3403886458850514,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.5712,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.33532744097004147,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6644,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.33314721264059993,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6385,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.33888798280900234,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.6358,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.32184629951311494,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.5888,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.402043844910486,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.5472,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3268309670747305,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6136,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.3147358042653876,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.6145,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.33905985365123564,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.5749,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.32554821385971133,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.5442,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3705835264290631,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6363,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.34687464446586974,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6419,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.37149323515615806,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6455,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.37767561793664783,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.6109,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3325337496275416,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6137,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.32366227573157247,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.6142,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3281643585543578,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6345,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.35630193957247364,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6526,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3680226392043987,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6138,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.33454525351931946,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.5325,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3541194241867334,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6184,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.33032402475086314,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.5966,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.33889136649801577,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5409,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.32944729310677895,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6122,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.343369601719398,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6667,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.33471215942591,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.5985,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.34766327231907157,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.5905,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.34794447852113786,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.6679,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.35352688204869276,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.5774,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.3503894409025353,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6049,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3419316201221051,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.5757,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.3474883270272006,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.6044,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3400637312711317,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6258,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.3590234930612263,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6039,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3347921945667021,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.5836,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.35418037934495206,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6275,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.35907806810817905,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6032,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.3416727500520991,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.5893,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.34937663601271246,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6262,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.3468347223251416,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.596,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4663420462566947,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.5598,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.3478299882572493,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6037,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3491371976190531,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.578,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.3224168497203568,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.5707,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3497842183284705,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.628,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.3637221018117844,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6261,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3497090138868019,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.5972,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.3424617829976763,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6237,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3472679049611877,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.5898,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.38382084399980854,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6509,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.35818320452547475,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6183,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.3280855359615322,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.5884,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.34167181642304734,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6063,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.3581565635204731,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.5884,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.40457865558246575,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.573,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.3396694616042361,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.6134,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3288631131124086,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.565,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.3522850767680027,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6208,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3496630890644889,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6478,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.34521354334546384,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.5936,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.32916214591040255,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.5804,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.32492396630451986,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.5927,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3275153847995364,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6251,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.3528759656094755,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.5801,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.367972580363633,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6147,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.3587343173142828,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.5958,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3614376480967368,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.5897,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.3558733082244146,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6067,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.3576499918732122,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.617,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.32348481797534356,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.5924,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.35976954140226275,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6137,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3367133292322927,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.585,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3666196371390034,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6458,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.35296555685607484,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6428,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.36291384377744296,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6174,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.32944881339937504,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.5806,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3252347324995862,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5723,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.3759298251455837,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.614,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.34350352854627375,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6355,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.3255554235034715,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.5923,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.33935822693399353,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.5757,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.35956652809566675,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.5827,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3464718277540499,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6438,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.33711044965199344,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.5891,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3307753227099956,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.606,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.32072821421317327,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.5908,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3679805432721958,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6317,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.34951059101065973,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.5946,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.35348265534072415,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6028,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.34262473025738155,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.6453,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.339645286968352,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6071,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.33276544388234147,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.6231,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3381217622159822,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.569,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.32745384801860544,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.626,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.35174659838743455,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6391,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.3518656769447974,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6335,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3705221607331985,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.546,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.33276061750399677,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.5994,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.35280552080454464,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6496,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.3204166724597419,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.5826,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.32988206814279836,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6048,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.3246960203505506,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.5601,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.339714136580042,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6614,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.3552916599335909,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6236,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3404478280633831,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6362,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.32239417587058217,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6004,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.32041107412168346,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5549,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.3346809999458499,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.5619,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.364466231133146,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6141,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.3317189002719038,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.6238,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3256609411268322,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5651,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.31734296267635187,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.6016,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3406610875494774,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5982,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.36246008387194356,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6351,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.33984162849695077,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6314,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.3275584466815877,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.5832,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3395780237628323,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6099,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.3443027185143187,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6134,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3221773032461762,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.5805,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.35206646392275887,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6299,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.32833005369432117,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.5983,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.332205742625648,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.6267,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.34898125982747574,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.5582,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.31796847705272,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.5206,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3641126830546926,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.5835,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.3270521049353336,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.569,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.35222406275247237,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6243,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.3289485047089139,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.5896,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.33274687560591176,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6124,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.3493330572049464,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6317,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3362064958501068,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.5753,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.3576621743241454,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.5804,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.33012366472913696,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.5788,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.347653986542375,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.5854,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3323912180374366,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.5885,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.38083319979832697,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6529,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3426288486745291,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.652,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.3258227992254228,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.5823,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.34552782393746645,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6429,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.34585912369353516,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.5873,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.32788379221951186,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.626,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.3343646080672475,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.6098,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.355768636496455,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6236,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.3730126642330909,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.5936,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.31953139488473575,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.5555,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.3523701477101136,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6754,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.36742759253540924,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6482,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.33968792210204607,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.6029,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.36371297347883375,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6594,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.3660127629756041,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.5838,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.35511370698314615,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6452,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3363613444987827,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.5655,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.36322908286669636,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6346,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.33539501918083525,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.5852,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.37291280962010165,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6272,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.3361674696613345,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.5811,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.34984812754410244,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.615,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.46656006049123666,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.6002,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3370570438284523,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.5785,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.35279039394029527,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6398,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.33664616276466686,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6285,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.3624270205714784,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.5839,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3457343293859614,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6187,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.3290788278898686,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.5759,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.35531106545655416,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6329,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.3492304272841695,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6074,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.33751463361052897,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.5959,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.34414667982486763,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6766,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.34208711480428583,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.5846,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.3274222219532599,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.564,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.32352946950501243,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.5646,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.32646724699080115,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6262,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.34292711834350403,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6225,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.3396645859144178,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6387,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.35753988403286646,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.678,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.38303375416849994,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.6386,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.32437564943451874,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6297,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.33455678611417283,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.5864,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.35633372512862366,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.5792,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.3270029314567895,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.5935,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.424905218617759,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6181,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3438103926353635,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.6379,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3484985442300672,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5823,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.37070144803293215,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.7229,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3435688728484316,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.59,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.3392515140501043,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6108,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3380362142098612,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6424,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.3390725949829156,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6334,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.34696472062215294,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.5874,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.3319619683863788,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.569,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.34664240273570046,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5728,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.32850816834155266,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.5932,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3637393330704097,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6328,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.32476356201295953,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6157,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.36321274221856314,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6552,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.3454751537662377,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6139,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.32388775087044114,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.562,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.3297851417597346,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.5993,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.32841215556161885,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.5903,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.36272395447568967,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6059,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.8762168378974585,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6509,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.3184011726641648,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.594,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3473583567384825,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6342,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.3584881362293531,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6197,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.32737124624942704,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6299,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.37869537163973865,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.6598,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3321138787804279,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.5605,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.3408301098945604,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.6054,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3615435875111645,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6177,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.34767742571856813,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.5915,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.327953339120735,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.5491,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.3300646854341275,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.6129,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.339220283137721,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6061,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.32327635395524157,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.6273,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3464923503762519,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5959,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.33387843346355767,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6092,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3366781877057486,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6642,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.36947007399686926,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6075,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3534595267305509,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.5763,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.32910943744054405,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.5804,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3113027212367898,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.5952,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.31503140780736955,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.565,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3199722417834563,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.5863,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.36333310408385744,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.5642,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.34357775418269904,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.562,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.3374720421675638,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.5757,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3240559607434544,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.586,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.3235001197601677,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.5964,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.32646149016401926,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6076,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.3433792228141088,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6089,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3406389457841003,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5476,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.3666160110041597,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.6208,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3339058118069488,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.605,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.31960892698928794,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.5785,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.35726561286788344,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.5882,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.34382423390343725,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6186,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.33950063476087333,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6131,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.3374258237510089,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.601,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.32243906417604673,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5592,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.33738749681016067,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.6265,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.36964025638056597,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6118,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.33757112059065936,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6032,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.31982456897221395,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5737,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.3369736515602363,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.5822,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.32918194943225737,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.578,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.3409591484904072,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6282,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3775219527797019,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.605,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.33899188697787547,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.5586,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.35085086397272225,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6007,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.38394310885156824,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.5863,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3307196575296069,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6356,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.33602057099272453,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.5993,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3508656340887223,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6385,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.32674316969959266,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.5244,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3917784573175883,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5978,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.3415814546985931,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.5808,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.33900386574972013,
+      "learning_rate": 0.0,
+      "loss": 0.5779,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1113230063894528.0,
+      "train_loss": 0.6684961423873902,
+      "train_runtime": 19805.2254,
+      "train_samples_per_second": 1.01,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1113230063894528.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ddab6017a1a1fb6922a0b0971ea863eb587ac2fc
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "o_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5676b622f362e2aca7ec5e323372c92c042dd285
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8262ee6a0c00b8ce095b990a942055d72e79c6072afadd6521a5f42aef69fad
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e8fb1a17d6759b87a3b29b357dbabd8927e1935c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc232555e409ab19e56350ef2c9b0c80c0b12c30a0291e5aaa3b3d52cdb8089e
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..56daf9503a7c4fdf85b35199d042c2bd2782ce6b
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.1618730196957123,
+      "learning_rate": 5e-05,
+      "loss": 1.5515,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.15537504646185,
+      "learning_rate": 0.0001,
+      "loss": 1.5311,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.9635490214235893,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.4312,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.3893228481848556,
+      "learning_rate": 0.0002,
+      "loss": 1.167,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.3289968323340917,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.0447,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.8072806305690539,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0105,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6177964607456953,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.8789,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5524814937302823,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.8635,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5319431137217482,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9212,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5995041018941503,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9419,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5241617758084439,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8659,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5203540014161351,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8887,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5144968235440169,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9298,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.44732252083116936,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8116,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.48923527048555243,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9035,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4202366241305497,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.863,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4537252189692721,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8725,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4306220050888526,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.8376,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.3894710263894399,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8287,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.43976119402396047,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.823,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.40179621147791533,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7952,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4345187904250498,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8617,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4131232137462944,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8384,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.40603865564196473,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8331,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3951360345025102,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.7836,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.40477643152693216,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.7748,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4251533568833813,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.8444,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4071344808654217,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8003,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.408159311901172,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.8071,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.39865492737418323,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7849,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4276310275653838,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8444,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.434386133606189,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8215,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3822536210786355,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8132,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3913709039902287,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8282,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4171383919051642,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8489,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.38777521576935947,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.7557,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.38572156231558025,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.7754,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4104203019193881,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.7698,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5677828834671339,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7886,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4077418306551607,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8048,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.41489722264421747,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7622,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.39615414916810177,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.7987,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.40912637858839207,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.7637,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.38793742176167395,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8198,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3979430102052743,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8026,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3788509235665202,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.8023,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.39278185642418834,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.7481,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3696223706213371,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.7609,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.39313607939291717,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8157,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3855055319878002,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.7638,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.37066834001446136,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.758,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4170209578873234,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.834,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.37700762349814604,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.7533,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3694537319289278,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7159,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3764585488672252,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.7762,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.36568785296633305,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7848,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3642739327906848,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8302,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.38223649800753023,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.7719,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.41018205417563863,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.7676,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.37477718222402323,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.7395,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.373178036681771,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7662,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3869027594520346,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.801,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3525699091995798,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.7239,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.37067545461002194,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.7011,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.37290965245827035,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7643,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.37698723891357505,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.7229,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.37310662528724425,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7814,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.36161343594825424,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.7469,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.38820107691296035,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8049,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.380257947651291,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7446,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3532116216773163,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7377,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3856516769452259,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.7489,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.37631098630814735,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.7719,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.38056900801863475,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.713,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3475939820542006,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7242,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.35273458966475485,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.748,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3745939242868226,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7519,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3749958310986676,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.6751,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.346667180530484,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.6966,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3747288183174455,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8034,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3533701171483392,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7251,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3591629687481598,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.7635,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4084692648623678,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.6802,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.38688968930898393,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.764,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.39312529151121633,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.8013,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3863181564669427,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7498,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3799061879135913,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.7561,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3838739899745123,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.7573,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3530265455361142,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.7013,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.33230437953255176,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7152,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3833248244274872,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7238,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.39464827998307495,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.7773,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.36445590177391896,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7877,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.357480514684254,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.6932,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.37544630400071793,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7565,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.36203062045464296,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.6979,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3729928884427148,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.6918,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3649485582808177,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7278,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3908594146852727,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7761,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3642904141026686,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.6851,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.44262214768350755,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.6799,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.35993727801165104,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7673,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.40347095599001326,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.716,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3845207910750551,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7552,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.394463216691188,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.7964,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.348375773555577,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7627,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.38679798921064223,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7596,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.34654567263972674,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7239,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.35543614359220455,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.751,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.38143658550121157,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.7369,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.37067691582012285,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7399,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3488961142915352,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7192,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.33944638409930467,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.6978,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3568746231972527,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7851,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.41401609220830243,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7084,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.35282787153112793,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.7244,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3527796028448424,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.6861,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.5300463886375664,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7752,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3994773326174539,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.6969,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3651699090120398,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.7376,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.34676126862325934,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7347,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.37764716508598595,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7745,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3564807247913989,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7641,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3742103364582621,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7504,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3985429773500903,
+      "learning_rate": 0.0,
+      "loss": 0.7313,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 114429774790656.0,
+      "train_loss": 0.7987350339889526,
+      "train_runtime": 1999.0989,
+      "train_samples_per_second": 1.0,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 114429774790656.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7327bc9e0975f5675f0b1cad99f0ca78114c285
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "v_proj",
+    "up_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3c6191004ca47f0d06e57ebee60afa0c4a9679b9
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9ef59283510c9e7b49e3932426b46eabb879faff3c8887c30de65c24077d5fd
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d70a1447480e247ef0e5b0c3486d0b95f8fa6f2e
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3a09b73edd598b333e4c155070f11cbe643f9b38281bac20c220a35ec1c5993
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..770135592cc16f102bef0cff872dcf4ad8777ffd
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,17542 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0004,
+      "grad_norm": 1.233553174704933,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 1.5715,
+      "step": 1
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.3337982820383152,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 1.5787,
+      "step": 2
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 1.311564134607298,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.5487,
+      "step": 3
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.061096529351063,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 1.5362,
+      "step": 4
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 1.2336840237271833,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.5033,
+      "step": 5
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.9783991334698707,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.4035,
+      "step": 6
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.9119634003290528,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 1.4177,
+      "step": 7
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9977006256744981,
+      "learning_rate": 2.1333333333333335e-05,
+      "loss": 1.321,
+      "step": 8
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 1.1542983904843214,
+      "learning_rate": 2.4e-05,
+      "loss": 1.239,
+      "step": 9
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.9682399854813285,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.2082,
+      "step": 10
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.9957686985180999,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 1.1093,
+      "step": 11
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.7801228600481995,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.0706,
+      "step": 12
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.8252284965178849,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 1.0651,
+      "step": 13
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8172977694645526,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 1.0237,
+      "step": 14
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.8846448752697776,
+      "learning_rate": 4e-05,
+      "loss": 0.9963,
+      "step": 15
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7617761740155385,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 0.9759,
+      "step": 16
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.683601200285156,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 0.8564,
+      "step": 17
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.7155019049565624,
+      "learning_rate": 4.8e-05,
+      "loss": 0.9246,
+      "step": 18
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.6216097253918912,
+      "learning_rate": 5.0666666666666674e-05,
+      "loss": 0.9475,
+      "step": 19
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6398971674842517,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 0.8955,
+      "step": 20
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.6220173239834369,
+      "learning_rate": 5.6000000000000006e-05,
+      "loss": 0.9069,
+      "step": 21
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.5676109594805903,
+      "learning_rate": 5.866666666666667e-05,
+      "loss": 0.8886,
+      "step": 22
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 0.5562866799273941,
+      "learning_rate": 6.133333333333334e-05,
+      "loss": 0.9011,
+      "step": 23
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6399877528249651,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 0.8165,
+      "step": 24
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.523177200801189,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8198,
+      "step": 25
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.49637418009567086,
+      "learning_rate": 6.933333333333334e-05,
+      "loss": 0.8107,
+      "step": 26
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 0.5471005048916006,
+      "learning_rate": 7.2e-05,
+      "loss": 0.86,
+      "step": 27
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5322454492355576,
+      "learning_rate": 7.466666666666667e-05,
+      "loss": 0.841,
+      "step": 28
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 0.48738685686042843,
+      "learning_rate": 7.733333333333333e-05,
+      "loss": 0.8626,
+      "step": 29
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.5305190850010086,
+      "learning_rate": 8e-05,
+      "loss": 0.9097,
+      "step": 30
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 0.47861290923048533,
+      "learning_rate": 8.266666666666667e-05,
+      "loss": 0.8681,
+      "step": 31
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.4740060284166036,
+      "learning_rate": 8.533333333333334e-05,
+      "loss": 0.8481,
+      "step": 32
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 0.4723004681291369,
+      "learning_rate": 8.800000000000001e-05,
+      "loss": 0.8406,
+      "step": 33
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.5102009004568578,
+      "learning_rate": 9.066666666666667e-05,
+      "loss": 0.82,
+      "step": 34
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 0.4534745152781649,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.7999,
+      "step": 35
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.44852670349819795,
+      "learning_rate": 9.6e-05,
+      "loss": 0.8555,
+      "step": 36
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 0.4832952532334212,
+      "learning_rate": 9.866666666666668e-05,
+      "loss": 0.8648,
+      "step": 37
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.47705929417929455,
+      "learning_rate": 0.00010133333333333335,
+      "loss": 0.757,
+      "step": 38
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.4607455254160932,
+      "learning_rate": 0.00010400000000000001,
+      "loss": 0.8459,
+      "step": 39
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.4626227212847935,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 0.8004,
+      "step": 40
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 0.4886210667054391,
+      "learning_rate": 0.00010933333333333333,
+      "loss": 0.796,
+      "step": 41
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.45953131550049225,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 0.7777,
+      "step": 42
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 0.4988337626973592,
+      "learning_rate": 0.00011466666666666667,
+      "loss": 0.7889,
+      "step": 43
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.46646005480591024,
+      "learning_rate": 0.00011733333333333334,
+      "loss": 0.8255,
+      "step": 44
+    },
+    {
+      "epoch": 0.018,
+      "grad_norm": 0.45796241326093257,
+      "learning_rate": 0.00012,
+      "loss": 0.8138,
+      "step": 45
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5007827146329478,
+      "learning_rate": 0.00012266666666666668,
+      "loss": 0.8442,
+      "step": 46
+    },
+    {
+      "epoch": 0.0188,
+      "grad_norm": 0.4353271252466811,
+      "learning_rate": 0.00012533333333333334,
+      "loss": 0.801,
+      "step": 47
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.45968526845933466,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 0.7364,
+      "step": 48
+    },
+    {
+      "epoch": 0.0196,
+      "grad_norm": 0.49745369949682966,
+      "learning_rate": 0.00013066666666666668,
+      "loss": 0.8031,
+      "step": 49
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.44668329837307647,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.7674,
+      "step": 50
+    },
+    {
+      "epoch": 0.0204,
+      "grad_norm": 0.43655839767863414,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 0.8486,
+      "step": 51
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.472683113679215,
+      "learning_rate": 0.00013866666666666669,
+      "loss": 0.787,
+      "step": 52
+    },
+    {
+      "epoch": 0.0212,
+      "grad_norm": 0.450526582200461,
+      "learning_rate": 0.00014133333333333334,
+      "loss": 0.8041,
+      "step": 53
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5768027325528573,
+      "learning_rate": 0.000144,
+      "loss": 0.7016,
+      "step": 54
+    },
+    {
+      "epoch": 0.022,
+      "grad_norm": 0.481586914103551,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.7489,
+      "step": 55
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4505400211123096,
+      "learning_rate": 0.00014933333333333335,
+      "loss": 0.7228,
+      "step": 56
+    },
+    {
+      "epoch": 0.0228,
+      "grad_norm": 0.4542790336161104,
+      "learning_rate": 0.000152,
+      "loss": 0.7701,
+      "step": 57
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.44949587528522483,
+      "learning_rate": 0.00015466666666666667,
+      "loss": 0.7668,
+      "step": 58
+    },
+    {
+      "epoch": 0.0236,
+      "grad_norm": 0.4351243243946112,
+      "learning_rate": 0.00015733333333333333,
+      "loss": 0.826,
+      "step": 59
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4636031767973824,
+      "learning_rate": 0.00016,
+      "loss": 0.8101,
+      "step": 60
+    },
+    {
+      "epoch": 0.0244,
+      "grad_norm": 0.4523286961925652,
+      "learning_rate": 0.00016266666666666667,
+      "loss": 0.7196,
+      "step": 61
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.4411015337759786,
+      "learning_rate": 0.00016533333333333333,
+      "loss": 0.8573,
+      "step": 62
+    },
+    {
+      "epoch": 0.0252,
+      "grad_norm": 0.44374281606353655,
+      "learning_rate": 0.000168,
+      "loss": 0.8284,
+      "step": 63
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.43205787306575816,
+      "learning_rate": 0.00017066666666666668,
+      "loss": 0.7832,
+      "step": 64
+    },
+    {
+      "epoch": 0.026,
+      "grad_norm": 0.4500334614959462,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.8236,
+      "step": 65
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.49176342841903986,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 0.8431,
+      "step": 66
+    },
+    {
+      "epoch": 0.0268,
+      "grad_norm": 0.4171536469667789,
+      "learning_rate": 0.00017866666666666668,
+      "loss": 0.7936,
+      "step": 67
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4441975548154934,
+      "learning_rate": 0.00018133333333333334,
+      "loss": 0.7874,
+      "step": 68
+    },
+    {
+      "epoch": 0.0276,
+      "grad_norm": 0.42173891568743316,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 0.7767,
+      "step": 69
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.43152248333642174,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.8205,
+      "step": 70
+    },
+    {
+      "epoch": 0.0284,
+      "grad_norm": 0.43754948754338047,
+      "learning_rate": 0.00018933333333333335,
+      "loss": 0.7883,
+      "step": 71
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4510713409723826,
+      "learning_rate": 0.000192,
+      "loss": 0.7889,
+      "step": 72
+    },
+    {
+      "epoch": 0.0292,
+      "grad_norm": 0.43305456062703274,
+      "learning_rate": 0.0001946666666666667,
+      "loss": 0.7761,
+      "step": 73
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.44421606021947135,
+      "learning_rate": 0.00019733333333333335,
+      "loss": 0.8069,
+      "step": 74
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.4374806718919075,
+      "learning_rate": 0.0002,
+      "loss": 0.7374,
+      "step": 75
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.43669138824870574,
+      "learning_rate": 0.00019999991608372393,
+      "loss": 0.7494,
+      "step": 76
+    },
+    {
+      "epoch": 0.0308,
+      "grad_norm": 0.47021568540914727,
+      "learning_rate": 0.00019999966433503652,
+      "loss": 0.7765,
+      "step": 77
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.42215672127392045,
+      "learning_rate": 0.0001999992447543603,
+      "loss": 0.7376,
+      "step": 78
+    },
+    {
+      "epoch": 0.0316,
+      "grad_norm": 0.4550168133398646,
+      "learning_rate": 0.00019999865734239946,
+      "loss": 0.8398,
+      "step": 79
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4257398400765521,
+      "learning_rate": 0.00019999790210013988,
+      "loss": 0.7967,
+      "step": 80
+    },
+    {
+      "epoch": 0.0324,
+      "grad_norm": 0.46493621948454705,
+      "learning_rate": 0.0001999969790288491,
+      "loss": 0.7052,
+      "step": 81
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.41987088411626927,
+      "learning_rate": 0.00019999588813007633,
+      "loss": 0.7497,
+      "step": 82
+    },
+    {
+      "epoch": 0.0332,
+      "grad_norm": 0.4604213663969049,
+      "learning_rate": 0.00019999462940565243,
+      "loss": 0.7872,
+      "step": 83
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4256343198719839,
+      "learning_rate": 0.00019999320285769,
+      "loss": 0.8006,
+      "step": 84
+    },
+    {
+      "epoch": 0.034,
+      "grad_norm": 0.44084009183711087,
+      "learning_rate": 0.0001999916084885832,
+      "loss": 0.8113,
+      "step": 85
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.4485125928528638,
+      "learning_rate": 0.00019998984630100792,
+      "loss": 0.7894,
+      "step": 86
+    },
+    {
+      "epoch": 0.0348,
+      "grad_norm": 0.43001354176382595,
+      "learning_rate": 0.0001999879162979217,
+      "loss": 0.7267,
+      "step": 87
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42469727606498964,
+      "learning_rate": 0.0001999858184825637,
+      "loss": 0.7284,
+      "step": 88
+    },
+    {
+      "epoch": 0.0356,
+      "grad_norm": 0.42522972576982493,
+      "learning_rate": 0.00019998355285845475,
+      "loss": 0.7954,
+      "step": 89
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.42623386533074803,
+      "learning_rate": 0.0001999811194293973,
+      "loss": 0.7646,
+      "step": 90
+    },
+    {
+      "epoch": 0.0364,
+      "grad_norm": 0.4419344880281263,
+      "learning_rate": 0.00019997851819947537,
+      "loss": 0.8297,
+      "step": 91
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4152362718412211,
+      "learning_rate": 0.00019997574917305478,
+      "loss": 0.7784,
+      "step": 92
+    },
+    {
+      "epoch": 0.0372,
+      "grad_norm": 0.4232230671800955,
+      "learning_rate": 0.00019997281235478278,
+      "loss": 0.7855,
+      "step": 93
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.41852191170507774,
+      "learning_rate": 0.00019996970774958836,
+      "loss": 0.7304,
+      "step": 94
+    },
+    {
+      "epoch": 0.038,
+      "grad_norm": 0.44716788452141526,
+      "learning_rate": 0.00019996643536268204,
+      "loss": 0.7658,
+      "step": 95
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4255123756712993,
+      "learning_rate": 0.0001999629951995559,
+      "loss": 0.7895,
+      "step": 96
+    },
+    {
+      "epoch": 0.0388,
+      "grad_norm": 0.4442132358144533,
+      "learning_rate": 0.00019995938726598373,
+      "loss": 0.7595,
+      "step": 97
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.41829577238427884,
+      "learning_rate": 0.00019995561156802079,
+      "loss": 0.6954,
+      "step": 98
+    },
+    {
+      "epoch": 0.0396,
+      "grad_norm": 0.4158609423759713,
+      "learning_rate": 0.0001999516681120039,
+      "loss": 0.7325,
+      "step": 99
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4316103491720492,
+      "learning_rate": 0.00019994755690455152,
+      "loss": 0.7705,
+      "step": 100
+    },
+    {
+      "epoch": 0.0404,
+      "grad_norm": 0.40851905497651775,
+      "learning_rate": 0.0001999432779525635,
+      "loss": 0.7624,
+      "step": 101
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.4319653076801545,
+      "learning_rate": 0.0001999388312632214,
+      "loss": 0.7191,
+      "step": 102
+    },
+    {
+      "epoch": 0.0412,
+      "grad_norm": 0.6966142087903838,
+      "learning_rate": 0.00019993421684398824,
+      "loss": 0.7768,
+      "step": 103
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.41710643293646915,
+      "learning_rate": 0.00019992943470260844,
+      "loss": 0.7996,
+      "step": 104
+    },
+    {
+      "epoch": 0.042,
+      "grad_norm": 0.4110556419161805,
+      "learning_rate": 0.00019992448484710797,
+      "loss": 0.7359,
+      "step": 105
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5156907227676272,
+      "learning_rate": 0.00019991936728579437,
+      "loss": 0.7905,
+      "step": 106
+    },
+    {
+      "epoch": 0.0428,
+      "grad_norm": 0.42197055785073717,
+      "learning_rate": 0.00019991408202725655,
+      "loss": 0.7386,
+      "step": 107
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.43155124246288074,
+      "learning_rate": 0.0001999086290803649,
+      "loss": 0.7079,
+      "step": 108
+    },
+    {
+      "epoch": 0.0436,
+      "grad_norm": 0.41500836404692715,
+      "learning_rate": 0.00019990300845427125,
+      "loss": 0.7573,
+      "step": 109
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4277723496639902,
+      "learning_rate": 0.0001998972201584088,
+      "loss": 0.7001,
+      "step": 110
+    },
+    {
+      "epoch": 0.0444,
+      "grad_norm": 0.4203089960191111,
+      "learning_rate": 0.00019989126420249221,
+      "loss": 0.747,
+      "step": 111
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.40083853943140735,
+      "learning_rate": 0.00019988514059651752,
+      "loss": 0.7518,
+      "step": 112
+    },
+    {
+      "epoch": 0.0452,
+      "grad_norm": 0.4271473061997102,
+      "learning_rate": 0.00019987884935076213,
+      "loss": 0.7423,
+      "step": 113
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.4447134275311075,
+      "learning_rate": 0.00019987239047578482,
+      "loss": 0.7603,
+      "step": 114
+    },
+    {
+      "epoch": 0.046,
+      "grad_norm": 0.4141406035356386,
+      "learning_rate": 0.00019986576398242566,
+      "loss": 0.7072,
+      "step": 115
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.40561074396013835,
+      "learning_rate": 0.00019985896988180605,
+      "loss": 0.7718,
+      "step": 116
+    },
+    {
+      "epoch": 0.0468,
+      "grad_norm": 0.40220017292785776,
+      "learning_rate": 0.00019985200818532875,
+      "loss": 0.7502,
+      "step": 117
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4233888412820622,
+      "learning_rate": 0.0001998448789046777,
+      "loss": 0.7652,
+      "step": 118
+    },
+    {
+      "epoch": 0.0476,
+      "grad_norm": 0.41629114657404,
+      "learning_rate": 0.00019983758205181822,
+      "loss": 0.763,
+      "step": 119
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4334543642986752,
+      "learning_rate": 0.00019983011763899673,
+      "loss": 0.7573,
+      "step": 120
+    },
+    {
+      "epoch": 0.0484,
+      "grad_norm": 0.44993886253918425,
+      "learning_rate": 0.00019982248567874098,
+      "loss": 0.7478,
+      "step": 121
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.4055243046386066,
+      "learning_rate": 0.00019981468618385988,
+      "loss": 0.7378,
+      "step": 122
+    },
+    {
+      "epoch": 0.0492,
+      "grad_norm": 0.4413650244469547,
+      "learning_rate": 0.00019980671916744352,
+      "loss": 0.6988,
+      "step": 123
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.44282863942194783,
+      "learning_rate": 0.00019979858464286317,
+      "loss": 0.7739,
+      "step": 124
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.4003454200231535,
+      "learning_rate": 0.00019979028262377118,
+      "loss": 0.7124,
+      "step": 125
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.42277199793975706,
+      "learning_rate": 0.00019978181312410104,
+      "loss": 0.8094,
+      "step": 126
+    },
+    {
+      "epoch": 0.0508,
+      "grad_norm": 0.42727006528696454,
+      "learning_rate": 0.00019977317615806737,
+      "loss": 0.7637,
+      "step": 127
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4194902391741022,
+      "learning_rate": 0.00019976437174016573,
+      "loss": 0.7225,
+      "step": 128
+    },
+    {
+      "epoch": 0.0516,
+      "grad_norm": 0.4083313507877906,
+      "learning_rate": 0.00019975539988517288,
+      "loss": 0.6898,
+      "step": 129
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.4038095107764052,
+      "learning_rate": 0.00019974626060814647,
+      "loss": 0.7255,
+      "step": 130
+    },
+    {
+      "epoch": 0.0524,
+      "grad_norm": 0.40295378634918333,
+      "learning_rate": 0.0001997369539244252,
+      "loss": 0.7044,
+      "step": 131
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.42946977035313305,
+      "learning_rate": 0.0001997274798496287,
+      "loss": 0.8355,
+      "step": 132
+    },
+    {
+      "epoch": 0.0532,
+      "grad_norm": 0.45280745176400994,
+      "learning_rate": 0.00019971783839965756,
+      "loss": 0.8345,
+      "step": 133
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.40812858974254734,
+      "learning_rate": 0.00019970802959069328,
+      "loss": 0.7534,
+      "step": 134
+    },
+    {
+      "epoch": 0.054,
+      "grad_norm": 0.4005018025383052,
+      "learning_rate": 0.00019969805343919821,
+      "loss": 0.7328,
+      "step": 135
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.41213665680910816,
+      "learning_rate": 0.0001996879099619156,
+      "loss": 0.708,
+      "step": 136
+    },
+    {
+      "epoch": 0.0548,
+      "grad_norm": 0.3831675511800143,
+      "learning_rate": 0.00019967759917586953,
+      "loss": 0.7062,
+      "step": 137
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.4095960002451829,
+      "learning_rate": 0.00019966712109836476,
+      "loss": 0.7307,
+      "step": 138
+    },
+    {
+      "epoch": 0.0556,
+      "grad_norm": 0.4132300546181534,
+      "learning_rate": 0.000199656475746987,
+      "loss": 0.7887,
+      "step": 139
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.397649037372497,
+      "learning_rate": 0.00019964566313960264,
+      "loss": 0.7265,
+      "step": 140
+    },
+    {
+      "epoch": 0.0564,
+      "grad_norm": 0.42120826285768476,
+      "learning_rate": 0.0001996346832943587,
+      "loss": 0.7836,
+      "step": 141
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.4342001860001955,
+      "learning_rate": 0.00019962353622968295,
+      "loss": 0.709,
+      "step": 142
+    },
+    {
+      "epoch": 0.0572,
+      "grad_norm": 0.4123432548955716,
+      "learning_rate": 0.00019961222196428378,
+      "loss": 0.767,
+      "step": 143
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4145218974145123,
+      "learning_rate": 0.0001996007405171502,
+      "loss": 0.7571,
+      "step": 144
+    },
+    {
+      "epoch": 0.058,
+      "grad_norm": 0.4195605987194582,
+      "learning_rate": 0.00019958909190755187,
+      "loss": 0.7503,
+      "step": 145
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.408252075709149,
+      "learning_rate": 0.00019957727615503888,
+      "loss": 0.7169,
+      "step": 146
+    },
+    {
+      "epoch": 0.0588,
+      "grad_norm": 0.37858547837823253,
+      "learning_rate": 0.00019956529327944198,
+      "loss": 0.7304,
+      "step": 147
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.41152287217885664,
+      "learning_rate": 0.00019955314330087225,
+      "loss": 0.7074,
+      "step": 148
+    },
+    {
+      "epoch": 0.0596,
+      "grad_norm": 0.39588672202028524,
+      "learning_rate": 0.00019954082623972142,
+      "loss": 0.7444,
+      "step": 149
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.423265875933927,
+      "learning_rate": 0.0001995283421166614,
+      "loss": 0.7311,
+      "step": 150
+    },
+    {
+      "epoch": 0.0604,
+      "grad_norm": 0.4117989229058825,
+      "learning_rate": 0.00019951569095264473,
+      "loss": 0.7431,
+      "step": 151
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4097653391835411,
+      "learning_rate": 0.0001995028727689041,
+      "loss": 0.7139,
+      "step": 152
+    },
+    {
+      "epoch": 0.0612,
+      "grad_norm": 0.4080502432875463,
+      "learning_rate": 0.00019948988758695263,
+      "loss": 0.7546,
+      "step": 153
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.4289198634795377,
+      "learning_rate": 0.00019947673542858367,
+      "loss": 0.7418,
+      "step": 154
+    },
+    {
+      "epoch": 0.062,
+      "grad_norm": 0.4053067174061218,
+      "learning_rate": 0.00019946341631587087,
+      "loss": 0.7417,
+      "step": 155
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4149451768425185,
+      "learning_rate": 0.00019944993027116797,
+      "loss": 0.7538,
+      "step": 156
+    },
+    {
+      "epoch": 0.0628,
+      "grad_norm": 0.3902632133129172,
+      "learning_rate": 0.00019943627731710897,
+      "loss": 0.7075,
+      "step": 157
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.40528543888587293,
+      "learning_rate": 0.00019942245747660796,
+      "loss": 0.7732,
+      "step": 158
+    },
+    {
+      "epoch": 0.0636,
+      "grad_norm": 0.39075970939435,
+      "learning_rate": 0.00019940847077285916,
+      "loss": 0.7051,
+      "step": 159
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.39548018332760576,
+      "learning_rate": 0.0001993943172293368,
+      "loss": 0.7164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0644,
+      "grad_norm": 0.39318742210695906,
+      "learning_rate": 0.0001993799968697951,
+      "loss": 0.6935,
+      "step": 161
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.39855227114585756,
+      "learning_rate": 0.00019936550971826834,
+      "loss": 0.7697,
+      "step": 162
+    },
+    {
+      "epoch": 0.0652,
+      "grad_norm": 0.38947866213700505,
+      "learning_rate": 0.00019935085579907063,
+      "loss": 0.7061,
+      "step": 163
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.42041126259328787,
+      "learning_rate": 0.00019933603513679605,
+      "loss": 0.7051,
+      "step": 164
+    },
+    {
+      "epoch": 0.066,
+      "grad_norm": 0.38933284000032525,
+      "learning_rate": 0.00019932104775631846,
+      "loss": 0.7598,
+      "step": 165
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.40483887852504585,
+      "learning_rate": 0.0001993058936827916,
+      "loss": 0.7265,
+      "step": 166
+    },
+    {
+      "epoch": 0.0668,
+      "grad_norm": 0.39830833640345015,
+      "learning_rate": 0.00019929057294164893,
+      "loss": 0.7151,
+      "step": 167
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4115135101352071,
+      "learning_rate": 0.0001992750855586036,
+      "loss": 0.7502,
+      "step": 168
+    },
+    {
+      "epoch": 0.0676,
+      "grad_norm": 0.38172900244707286,
+      "learning_rate": 0.00019925943155964856,
+      "loss": 0.7247,
+      "step": 169
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.379583113248406,
+      "learning_rate": 0.00019924361097105623,
+      "loss": 0.7005,
+      "step": 170
+    },
+    {
+      "epoch": 0.0684,
+      "grad_norm": 0.4500629854402789,
+      "learning_rate": 0.00019922762381937878,
+      "loss": 0.791,
+      "step": 171
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4119696775639509,
+      "learning_rate": 0.0001992114701314478,
+      "loss": 0.6956,
+      "step": 172
+    },
+    {
+      "epoch": 0.0692,
+      "grad_norm": 0.42835121955460526,
+      "learning_rate": 0.00019919514993437445,
+      "loss": 0.6814,
+      "step": 173
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.4078948690763593,
+      "learning_rate": 0.00019917866325554938,
+      "loss": 0.7347,
+      "step": 174
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.4110342906921341,
+      "learning_rate": 0.00019916201012264254,
+      "loss": 0.7335,
+      "step": 175
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.41830987692511135,
+      "learning_rate": 0.0001991451905636033,
+      "loss": 0.7822,
+      "step": 176
+    },
+    {
+      "epoch": 0.0708,
+      "grad_norm": 0.39501674747254456,
+      "learning_rate": 0.00019912820460666044,
+      "loss": 0.7925,
+      "step": 177
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.42899841467332,
+      "learning_rate": 0.00019911105228032186,
+      "loss": 0.7735,
+      "step": 178
+    },
+    {
+      "epoch": 0.0716,
+      "grad_norm": 0.41271689167378905,
+      "learning_rate": 0.00019909373361337476,
+      "loss": 0.725,
+      "step": 179
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.3834594071228448,
+      "learning_rate": 0.0001990762486348855,
+      "loss": 0.7488,
+      "step": 180
+    },
+    {
+      "epoch": 0.0724,
+      "grad_norm": 0.40870320889373035,
+      "learning_rate": 0.00019905859737419956,
+      "loss": 0.742,
+      "step": 181
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.39509791257797366,
+      "learning_rate": 0.00019904077986094152,
+      "loss": 0.7336,
+      "step": 182
+    },
+    {
+      "epoch": 0.0732,
+      "grad_norm": 0.43168076295692787,
+      "learning_rate": 0.00019902279612501493,
+      "loss": 0.7687,
+      "step": 183
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.40758738331074995,
+      "learning_rate": 0.0001990046461966024,
+      "loss": 0.7744,
+      "step": 184
+    },
+    {
+      "epoch": 0.074,
+      "grad_norm": 0.4046682465439113,
+      "learning_rate": 0.00019898633010616542,
+      "loss": 0.7161,
+      "step": 185
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.3984141736259908,
+      "learning_rate": 0.0001989678478844443,
+      "loss": 0.7172,
+      "step": 186
+    },
+    {
+      "epoch": 0.0748,
+      "grad_norm": 0.4101332832422778,
+      "learning_rate": 0.00019894919956245824,
+      "loss": 0.7378,
+      "step": 187
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4041894850957024,
+      "learning_rate": 0.00019893038517150525,
+      "loss": 0.7698,
+      "step": 188
+    },
+    {
+      "epoch": 0.0756,
+      "grad_norm": 0.3756720407067433,
+      "learning_rate": 0.00019891140474316194,
+      "loss": 0.7126,
+      "step": 189
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.393690580210608,
+      "learning_rate": 0.00019889225830928365,
+      "loss": 0.7162,
+      "step": 190
+    },
+    {
+      "epoch": 0.0764,
+      "grad_norm": 0.409859652021738,
+      "learning_rate": 0.00019887294590200435,
+      "loss": 0.7246,
+      "step": 191
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.39567494207743276,
+      "learning_rate": 0.00019885346755373656,
+      "loss": 0.7754,
+      "step": 192
+    },
+    {
+      "epoch": 0.0772,
+      "grad_norm": 0.40662939057511427,
+      "learning_rate": 0.00019883382329717128,
+      "loss": 0.7528,
+      "step": 193
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.4008250299398194,
+      "learning_rate": 0.00019881401316527793,
+      "loss": 0.7124,
+      "step": 194
+    },
+    {
+      "epoch": 0.078,
+      "grad_norm": 0.4032947178650355,
+      "learning_rate": 0.0001987940371913044,
+      "loss": 0.7153,
+      "step": 195
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.3903992167133848,
+      "learning_rate": 0.00019877389540877687,
+      "loss": 0.7283,
+      "step": 196
+    },
+    {
+      "epoch": 0.0788,
+      "grad_norm": 0.41517569778134544,
+      "learning_rate": 0.0001987535878514998,
+      "loss": 0.7359,
+      "step": 197
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.41301040465889627,
+      "learning_rate": 0.0001987331145535559,
+      "loss": 0.707,
+      "step": 198
+    },
+    {
+      "epoch": 0.0796,
+      "grad_norm": 0.4135534682003512,
+      "learning_rate": 0.000198712475549306,
+      "loss": 0.686,
+      "step": 199
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.42377098811209635,
+      "learning_rate": 0.00019869167087338907,
+      "loss": 0.7537,
+      "step": 200
+    },
+    {
+      "epoch": 0.0804,
+      "grad_norm": 0.40895896445777713,
+      "learning_rate": 0.00019867070056072214,
+      "loss": 0.7311,
+      "step": 201
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.40915037278205396,
+      "learning_rate": 0.00019864956464650025,
+      "loss": 0.7449,
+      "step": 202
+    },
+    {
+      "epoch": 0.0812,
+      "grad_norm": 0.40445847337509294,
+      "learning_rate": 0.00019862826316619628,
+      "loss": 0.7158,
+      "step": 203
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.46618903941937756,
+      "learning_rate": 0.0001986067961555611,
+      "loss": 0.718,
+      "step": 204
+    },
+    {
+      "epoch": 0.082,
+      "grad_norm": 0.4046909719778866,
+      "learning_rate": 0.00019858516365062334,
+      "loss": 0.7036,
+      "step": 205
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.41548282817470816,
+      "learning_rate": 0.00019856336568768935,
+      "loss": 0.7487,
+      "step": 206
+    },
+    {
+      "epoch": 0.0828,
+      "grad_norm": 0.4170338736156831,
+      "learning_rate": 0.00019854140230334322,
+      "loss": 0.7642,
+      "step": 207
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3703809957686201,
+      "learning_rate": 0.0001985192735344467,
+      "loss": 0.7279,
+      "step": 208
+    },
+    {
+      "epoch": 0.0836,
+      "grad_norm": 0.4313534925981791,
+      "learning_rate": 0.00019849697941813898,
+      "loss": 0.7603,
+      "step": 209
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.3967111981874945,
+      "learning_rate": 0.00019847451999183694,
+      "loss": 0.6939,
+      "step": 210
+    },
+    {
+      "epoch": 0.0844,
+      "grad_norm": 0.39530819623251945,
+      "learning_rate": 0.00019845189529323475,
+      "loss": 0.754,
+      "step": 211
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.40409630943783886,
+      "learning_rate": 0.00019842910536030403,
+      "loss": 0.7026,
+      "step": 212
+    },
+    {
+      "epoch": 0.0852,
+      "grad_norm": 0.40861421773390033,
+      "learning_rate": 0.00019840615023129372,
+      "loss": 0.7411,
+      "step": 213
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.40743970735535945,
+      "learning_rate": 0.00019838302994472997,
+      "loss": 0.7416,
+      "step": 214
+    },
+    {
+      "epoch": 0.086,
+      "grad_norm": 0.4149156974194457,
+      "learning_rate": 0.0001983597445394162,
+      "loss": 0.7416,
+      "step": 215
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.40827125586244023,
+      "learning_rate": 0.00019833629405443284,
+      "loss": 0.6797,
+      "step": 216
+    },
+    {
+      "epoch": 0.0868,
+      "grad_norm": 0.3991174587727073,
+      "learning_rate": 0.0001983126785291375,
+      "loss": 0.7051,
+      "step": 217
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.3995736742692105,
+      "learning_rate": 0.00019828889800316466,
+      "loss": 0.6868,
+      "step": 218
+    },
+    {
+      "epoch": 0.0876,
+      "grad_norm": 0.4096471682734239,
+      "learning_rate": 0.00019826495251642578,
+      "loss": 0.7455,
+      "step": 219
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4143782757620809,
+      "learning_rate": 0.00019824084210910925,
+      "loss": 0.7631,
+      "step": 220
+    },
+    {
+      "epoch": 0.0884,
+      "grad_norm": 0.3849725243870913,
+      "learning_rate": 0.00019821656682168012,
+      "loss": 0.7172,
+      "step": 221
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.4145335757065706,
+      "learning_rate": 0.00019819212669488026,
+      "loss": 0.7241,
+      "step": 222
+    },
+    {
+      "epoch": 0.0892,
+      "grad_norm": 0.42576198570162194,
+      "learning_rate": 0.00019816752176972813,
+      "loss": 0.7725,
+      "step": 223
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.39758444622346295,
+      "learning_rate": 0.0001981427520875188,
+      "loss": 0.7016,
+      "step": 224
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.39097805447968603,
+      "learning_rate": 0.0001981178176898239,
+      "loss": 0.7032,
+      "step": 225
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.38693500061254904,
+      "learning_rate": 0.00019809271861849145,
+      "loss": 0.6888,
+      "step": 226
+    },
+    {
+      "epoch": 0.0908,
+      "grad_norm": 0.405762505678354,
+      "learning_rate": 0.00019806745491564586,
+      "loss": 0.6475,
+      "step": 227
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4041890904067067,
+      "learning_rate": 0.0001980420266236878,
+      "loss": 0.6854,
+      "step": 228
+    },
+    {
+      "epoch": 0.0916,
+      "grad_norm": 0.4070188903338878,
+      "learning_rate": 0.0001980164337852943,
+      "loss": 0.7296,
+      "step": 229
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.3943738247800487,
+      "learning_rate": 0.00019799067644341844,
+      "loss": 0.6877,
+      "step": 230
+    },
+    {
+      "epoch": 0.0924,
+      "grad_norm": 0.3879611737937716,
+      "learning_rate": 0.00019796475464128942,
+      "loss": 0.7096,
+      "step": 231
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.41252676247575876,
+      "learning_rate": 0.00019793866842241243,
+      "loss": 0.7955,
+      "step": 232
+    },
+    {
+      "epoch": 0.0932,
+      "grad_norm": 0.3731003151411376,
+      "learning_rate": 0.00019791241783056874,
+      "loss": 0.6998,
+      "step": 233
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.41307635587280533,
+      "learning_rate": 0.00019788600290981525,
+      "loss": 0.7288,
+      "step": 234
+    },
+    {
+      "epoch": 0.094,
+      "grad_norm": 0.39841410187280835,
+      "learning_rate": 0.0001978594237044849,
+      "loss": 0.731,
+      "step": 235
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.39306819371338675,
+      "learning_rate": 0.0001978326802591862,
+      "loss": 0.7462,
+      "step": 236
+    },
+    {
+      "epoch": 0.0948,
+      "grad_norm": 0.4173972279803122,
+      "learning_rate": 0.00019780577261880336,
+      "loss": 0.7252,
+      "step": 237
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.3734715331431929,
+      "learning_rate": 0.0001977787008284962,
+      "loss": 0.7047,
+      "step": 238
+    },
+    {
+      "epoch": 0.0956,
+      "grad_norm": 0.40155801467397784,
+      "learning_rate": 0.00019775146493369994,
+      "loss": 0.7478,
+      "step": 239
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4262333677432803,
+      "learning_rate": 0.0001977240649801253,
+      "loss": 0.7816,
+      "step": 240
+    },
+    {
+      "epoch": 0.0964,
+      "grad_norm": 0.391277356302728,
+      "learning_rate": 0.00019769650101375837,
+      "loss": 0.6988,
+      "step": 241
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.40394930428709697,
+      "learning_rate": 0.00019766877308086036,
+      "loss": 0.7343,
+      "step": 242
+    },
+    {
+      "epoch": 0.0972,
+      "grad_norm": 0.4730053935130541,
+      "learning_rate": 0.00019764088122796783,
+      "loss": 0.7479,
+      "step": 243
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.38553115270003163,
+      "learning_rate": 0.0001976128255018924,
+      "loss": 0.7224,
+      "step": 244
+    },
+    {
+      "epoch": 0.098,
+      "grad_norm": 0.3942639089554025,
+      "learning_rate": 0.00019758460594972068,
+      "loss": 0.6645,
+      "step": 245
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.4086576183632301,
+      "learning_rate": 0.00019755622261881427,
+      "loss": 0.7888,
+      "step": 246
+    },
+    {
+      "epoch": 0.0988,
+      "grad_norm": 0.402016469304673,
+      "learning_rate": 0.00019752767555680968,
+      "loss": 0.7368,
+      "step": 247
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3881004687395709,
+      "learning_rate": 0.00019749896481161808,
+      "loss": 0.7491,
+      "step": 248
+    },
+    {
+      "epoch": 0.0996,
+      "grad_norm": 0.4019891978415487,
+      "learning_rate": 0.00019747009043142555,
+      "loss": 0.7185,
+      "step": 249
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.378323402032098,
+      "learning_rate": 0.00019744105246469263,
+      "loss": 0.7158,
+      "step": 250
+    },
+    {
+      "epoch": 0.1004,
+      "grad_norm": 0.3906032109598485,
+      "learning_rate": 0.00019741185096015448,
+      "loss": 0.731,
+      "step": 251
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.38837749622029527,
+      "learning_rate": 0.00019738248596682078,
+      "loss": 0.6796,
+      "step": 252
+    },
+    {
+      "epoch": 0.1012,
+      "grad_norm": 0.37230188694518274,
+      "learning_rate": 0.0001973529575339755,
+      "loss": 0.6871,
+      "step": 253
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.41085346510849274,
+      "learning_rate": 0.00019732326571117703,
+      "loss": 0.7151,
+      "step": 254
+    },
+    {
+      "epoch": 0.102,
+      "grad_norm": 0.39845782119378576,
+      "learning_rate": 0.00019729341054825782,
+      "loss": 0.7581,
+      "step": 255
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.40171006946231647,
+      "learning_rate": 0.00019726339209532462,
+      "loss": 0.7216,
+      "step": 256
+    },
+    {
+      "epoch": 0.1028,
+      "grad_norm": 0.41249135755171695,
+      "learning_rate": 0.00019723321040275815,
+      "loss": 0.7617,
+      "step": 257
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.38194977115100603,
+      "learning_rate": 0.0001972028655212131,
+      "loss": 0.7661,
+      "step": 258
+    },
+    {
+      "epoch": 0.1036,
+      "grad_norm": 0.3927113415894503,
+      "learning_rate": 0.00019717235750161806,
+      "loss": 0.6845,
+      "step": 259
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4184396270944774,
+      "learning_rate": 0.00019714168639517544,
+      "loss": 0.6976,
+      "step": 260
+    },
+    {
+      "epoch": 0.1044,
+      "grad_norm": 0.3929516613812163,
+      "learning_rate": 0.00019711085225336132,
+      "loss": 0.6956,
+      "step": 261
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.3910654862881255,
+      "learning_rate": 0.00019707985512792543,
+      "loss": 0.6977,
+      "step": 262
+    },
+    {
+      "epoch": 0.1052,
+      "grad_norm": 0.3876378264776691,
+      "learning_rate": 0.00019704869507089105,
+      "loss": 0.7256,
+      "step": 263
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3697637416688594,
+      "learning_rate": 0.0001970173721345549,
+      "loss": 0.6674,
+      "step": 264
+    },
+    {
+      "epoch": 0.106,
+      "grad_norm": 0.4147680271429278,
+      "learning_rate": 0.00019698588637148703,
+      "loss": 0.7912,
+      "step": 265
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.40245644770019046,
+      "learning_rate": 0.00019695423783453088,
+      "loss": 0.7434,
+      "step": 266
+    },
+    {
+      "epoch": 0.1068,
+      "grad_norm": 0.39585106634137457,
+      "learning_rate": 0.00019692242657680286,
+      "loss": 0.696,
+      "step": 267
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.38206639064738385,
+      "learning_rate": 0.00019689045265169273,
+      "loss": 0.6787,
+      "step": 268
+    },
+    {
+      "epoch": 0.1076,
+      "grad_norm": 0.38659658884436554,
+      "learning_rate": 0.0001968583161128631,
+      "loss": 0.7411,
+      "step": 269
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.37684016450723856,
+      "learning_rate": 0.0001968260170142496,
+      "loss": 0.7033,
+      "step": 270
+    },
+    {
+      "epoch": 0.1084,
+      "grad_norm": 0.3929047202882145,
+      "learning_rate": 0.00019679355541006054,
+      "loss": 0.6944,
+      "step": 271
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.38317417460333625,
+      "learning_rate": 0.00019676093135477713,
+      "loss": 0.735,
+      "step": 272
+    },
+    {
+      "epoch": 0.1092,
+      "grad_norm": 0.3837467243669443,
+      "learning_rate": 0.0001967281449031531,
+      "loss": 0.6802,
+      "step": 273
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.3929277164168596,
+      "learning_rate": 0.00019669519611021486,
+      "loss": 0.7336,
+      "step": 274
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.39661746176816876,
+      "learning_rate": 0.00019666208503126112,
+      "loss": 0.7604,
+      "step": 275
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.39575484759133867,
+      "learning_rate": 0.00019662881172186313,
+      "loss": 0.7149,
+      "step": 276
+    },
+    {
+      "epoch": 0.1108,
+      "grad_norm": 0.380670072192301,
+      "learning_rate": 0.00019659537623786428,
+      "loss": 0.7044,
+      "step": 277
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.3664185447465465,
+      "learning_rate": 0.00019656177863538026,
+      "loss": 0.6484,
+      "step": 278
+    },
+    {
+      "epoch": 0.1116,
+      "grad_norm": 0.39537126352935253,
+      "learning_rate": 0.00019652801897079869,
+      "loss": 0.705,
+      "step": 279
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4000097402559313,
+      "learning_rate": 0.00019649409730077935,
+      "loss": 0.6854,
+      "step": 280
+    },
+    {
+      "epoch": 0.1124,
+      "grad_norm": 0.3926356257874463,
+      "learning_rate": 0.00019646001368225382,
+      "loss": 0.7619,
+      "step": 281
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.3904258346252303,
+      "learning_rate": 0.0001964257681724255,
+      "loss": 0.6844,
+      "step": 282
+    },
+    {
+      "epoch": 0.1132,
+      "grad_norm": 0.3868867524178238,
+      "learning_rate": 0.00019639136082876953,
+      "loss": 0.6828,
+      "step": 283
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.37141624945766505,
+      "learning_rate": 0.00019635679170903258,
+      "loss": 0.7094,
+      "step": 284
+    },
+    {
+      "epoch": 0.114,
+      "grad_norm": 0.36599494714533054,
+      "learning_rate": 0.00019632206087123296,
+      "loss": 0.6958,
+      "step": 285
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.4065700930147299,
+      "learning_rate": 0.00019628716837366027,
+      "loss": 0.7096,
+      "step": 286
+    },
+    {
+      "epoch": 0.1148,
+      "grad_norm": 0.40651433946028726,
+      "learning_rate": 0.00019625211427487548,
+      "loss": 0.6958,
+      "step": 287
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4232769181127487,
+      "learning_rate": 0.00019621689863371083,
+      "loss": 0.7201,
+      "step": 288
+    },
+    {
+      "epoch": 0.1156,
+      "grad_norm": 0.4127296931497552,
+      "learning_rate": 0.00019618152150926955,
+      "loss": 0.6296,
+      "step": 289
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.3882774288253915,
+      "learning_rate": 0.000196145982960926,
+      "loss": 0.7274,
+      "step": 290
+    },
+    {
+      "epoch": 0.1164,
+      "grad_norm": 0.4409139953301101,
+      "learning_rate": 0.00019611028304832546,
+      "loss": 0.812,
+      "step": 291
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.3878309341285608,
+      "learning_rate": 0.000196074421831384,
+      "loss": 0.6975,
+      "step": 292
+    },
+    {
+      "epoch": 0.1172,
+      "grad_norm": 0.386519880007628,
+      "learning_rate": 0.00019603839937028838,
+      "loss": 0.6707,
+      "step": 293
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.4067072595460183,
+      "learning_rate": 0.00019600221572549606,
+      "loss": 0.7579,
+      "step": 294
+    },
+    {
+      "epoch": 0.118,
+      "grad_norm": 0.38200579454885775,
+      "learning_rate": 0.00019596587095773495,
+      "loss": 0.7511,
+      "step": 295
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.39000018516209073,
+      "learning_rate": 0.00019592936512800342,
+      "loss": 0.7013,
+      "step": 296
+    },
+    {
+      "epoch": 0.1188,
+      "grad_norm": 0.3855048239013933,
+      "learning_rate": 0.00019589269829757008,
+      "loss": 0.6928,
+      "step": 297
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.4144883803849691,
+      "learning_rate": 0.00019585587052797389,
+      "loss": 0.6998,
+      "step": 298
+    },
+    {
+      "epoch": 0.1196,
+      "grad_norm": 0.4187340811173014,
+      "learning_rate": 0.00019581888188102375,
+      "loss": 0.7296,
+      "step": 299
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.43580089453193305,
+      "learning_rate": 0.00019578173241879872,
+      "loss": 0.773,
+      "step": 300
+    },
+    {
+      "epoch": 0.1204,
+      "grad_norm": 0.38469047116127925,
+      "learning_rate": 0.00019574442220364767,
+      "loss": 0.7402,
+      "step": 301
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.4056538975856685,
+      "learning_rate": 0.00019570695129818926,
+      "loss": 0.6336,
+      "step": 302
+    },
+    {
+      "epoch": 0.1212,
+      "grad_norm": 0.3803875541548997,
+      "learning_rate": 0.0001956693197653119,
+      "loss": 0.7296,
+      "step": 303
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.39397850043949445,
+      "learning_rate": 0.00019563152766817354,
+      "loss": 0.6881,
+      "step": 304
+    },
+    {
+      "epoch": 0.122,
+      "grad_norm": 0.3934750562850638,
+      "learning_rate": 0.00019559357507020162,
+      "loss": 0.7417,
+      "step": 305
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.35516981921786084,
+      "learning_rate": 0.00019555546203509297,
+      "loss": 0.6466,
+      "step": 306
+    },
+    {
+      "epoch": 0.1228,
+      "grad_norm": 0.3871342400238589,
+      "learning_rate": 0.00019551718862681364,
+      "loss": 0.699,
+      "step": 307
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.37122507606125155,
+      "learning_rate": 0.00019547875490959885,
+      "loss": 0.7068,
+      "step": 308
+    },
+    {
+      "epoch": 0.1236,
+      "grad_norm": 0.5062668465863249,
+      "learning_rate": 0.00019544016094795295,
+      "loss": 0.6881,
+      "step": 309
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.4088913754479982,
+      "learning_rate": 0.00019540140680664913,
+      "loss": 0.7788,
+      "step": 310
+    },
+    {
+      "epoch": 0.1244,
+      "grad_norm": 0.4080616458426694,
+      "learning_rate": 0.00019536249255072948,
+      "loss": 0.7358,
+      "step": 311
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.40713022875650223,
+      "learning_rate": 0.00019532341824550479,
+      "loss": 0.6608,
+      "step": 312
+    },
+    {
+      "epoch": 0.1252,
+      "grad_norm": 0.39832416169944584,
+      "learning_rate": 0.0001952841839565544,
+      "loss": 0.7263,
+      "step": 313
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.3832264350159671,
+      "learning_rate": 0.0001952447897497263,
+      "loss": 0.669,
+      "step": 314
+    },
+    {
+      "epoch": 0.126,
+      "grad_norm": 0.3875302932764967,
+      "learning_rate": 0.00019520523569113677,
+      "loss": 0.7009,
+      "step": 315
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.40318507984409185,
+      "learning_rate": 0.00019516552184717037,
+      "loss": 0.7279,
+      "step": 316
+    },
+    {
+      "epoch": 0.1268,
+      "grad_norm": 0.37726750077569626,
+      "learning_rate": 0.00019512564828447988,
+      "loss": 0.6794,
+      "step": 317
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.40181893918441336,
+      "learning_rate": 0.0001950856150699861,
+      "loss": 0.7269,
+      "step": 318
+    },
+    {
+      "epoch": 0.1276,
+      "grad_norm": 0.4019533837773805,
+      "learning_rate": 0.0001950454222708778,
+      "loss": 0.651,
+      "step": 319
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3797291101627864,
+      "learning_rate": 0.0001950050699546116,
+      "loss": 0.7245,
+      "step": 320
+    },
+    {
+      "epoch": 0.1284,
+      "grad_norm": 0.4041662776983708,
+      "learning_rate": 0.0001949645581889118,
+      "loss": 0.7394,
+      "step": 321
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.3907511069014927,
+      "learning_rate": 0.00019492388704177036,
+      "loss": 0.7226,
+      "step": 322
+    },
+    {
+      "epoch": 0.1292,
+      "grad_norm": 0.367624741020837,
+      "learning_rate": 0.00019488305658144667,
+      "loss": 0.7047,
+      "step": 323
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.3939626503174779,
+      "learning_rate": 0.00019484206687646753,
+      "loss": 0.7064,
+      "step": 324
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.4157025875157999,
+      "learning_rate": 0.00019480091799562704,
+      "loss": 0.7476,
+      "step": 325
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.3805353828876863,
+      "learning_rate": 0.00019475961000798645,
+      "loss": 0.6952,
+      "step": 326
+    },
+    {
+      "epoch": 0.1308,
+      "grad_norm": 0.4169465960559036,
+      "learning_rate": 0.0001947181429828739,
+      "loss": 0.7243,
+      "step": 327
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3604721681368048,
+      "learning_rate": 0.00019467651698988462,
+      "loss": 0.6753,
+      "step": 328
+    },
+    {
+      "epoch": 0.1316,
+      "grad_norm": 0.36477468252574086,
+      "learning_rate": 0.0001946347320988806,
+      "loss": 0.6663,
+      "step": 329
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.4097890346629651,
+      "learning_rate": 0.00019459278837999046,
+      "loss": 0.7221,
+      "step": 330
+    },
+    {
+      "epoch": 0.1324,
+      "grad_norm": 0.39758712851129263,
+      "learning_rate": 0.00019455068590360942,
+      "loss": 0.7078,
+      "step": 331
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.3846785536341143,
+      "learning_rate": 0.00019450842474039913,
+      "loss": 0.7174,
+      "step": 332
+    },
+    {
+      "epoch": 0.1332,
+      "grad_norm": 0.38510034785734193,
+      "learning_rate": 0.00019446600496128758,
+      "loss": 0.6909,
+      "step": 333
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.40677707004711616,
+      "learning_rate": 0.00019442342663746902,
+      "loss": 0.7074,
+      "step": 334
+    },
+    {
+      "epoch": 0.134,
+      "grad_norm": 0.42544237574759275,
+      "learning_rate": 0.00019438068984040365,
+      "loss": 0.6776,
+      "step": 335
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4053693843260777,
+      "learning_rate": 0.00019433779464181778,
+      "loss": 0.6747,
+      "step": 336
+    },
+    {
+      "epoch": 0.1348,
+      "grad_norm": 0.3913371200478165,
+      "learning_rate": 0.00019429474111370352,
+      "loss": 0.6924,
+      "step": 337
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.4149006096866092,
+      "learning_rate": 0.0001942515293283187,
+      "loss": 0.6904,
+      "step": 338
+    },
+    {
+      "epoch": 0.1356,
+      "grad_norm": 0.38030074742371944,
+      "learning_rate": 0.00019420815935818672,
+      "loss": 0.6468,
+      "step": 339
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4109580631805435,
+      "learning_rate": 0.00019416463127609656,
+      "loss": 0.6809,
+      "step": 340
+    },
+    {
+      "epoch": 0.1364,
+      "grad_norm": 0.3874486823898277,
+      "learning_rate": 0.00019412094515510248,
+      "loss": 0.7066,
+      "step": 341
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.400169839799612,
+      "learning_rate": 0.00019407710106852404,
+      "loss": 0.6735,
+      "step": 342
+    },
+    {
+      "epoch": 0.1372,
+      "grad_norm": 0.4413037177842987,
+      "learning_rate": 0.00019403309908994586,
+      "loss": 0.6974,
+      "step": 343
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.3742979700025569,
+      "learning_rate": 0.00019398893929321761,
+      "loss": 0.6784,
+      "step": 344
+    },
+    {
+      "epoch": 0.138,
+      "grad_norm": 0.42359247868849575,
+      "learning_rate": 0.00019394462175245381,
+      "loss": 0.7778,
+      "step": 345
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.4234055597178886,
+      "learning_rate": 0.00019390014654203369,
+      "loss": 0.7195,
+      "step": 346
+    },
+    {
+      "epoch": 0.1388,
+      "grad_norm": 0.4005993402368827,
+      "learning_rate": 0.0001938555137366011,
+      "loss": 0.7165,
+      "step": 347
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.3816229344928684,
+      "learning_rate": 0.00019381072341106452,
+      "loss": 0.7154,
+      "step": 348
+    },
+    {
+      "epoch": 0.1396,
+      "grad_norm": 0.40684887980158285,
+      "learning_rate": 0.0001937657756405966,
+      "loss": 0.7035,
+      "step": 349
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4008381489661814,
+      "learning_rate": 0.00019372067050063438,
+      "loss": 0.7016,
+      "step": 350
+    },
+    {
+      "epoch": 0.1404,
+      "grad_norm": 0.4033847431397458,
+      "learning_rate": 0.00019367540806687893,
+      "loss": 0.7286,
+      "step": 351
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.40743931660695076,
+      "learning_rate": 0.0001936299884152954,
+      "loss": 0.7008,
+      "step": 352
+    },
+    {
+      "epoch": 0.1412,
+      "grad_norm": 0.3762833708731107,
+      "learning_rate": 0.0001935844116221127,
+      "loss": 0.6785,
+      "step": 353
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.39511133029768386,
+      "learning_rate": 0.00019353867776382354,
+      "loss": 0.7313,
+      "step": 354
+    },
+    {
+      "epoch": 0.142,
+      "grad_norm": 0.41516019571816626,
+      "learning_rate": 0.00019349278691718427,
+      "loss": 0.7304,
+      "step": 355
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4090372519670217,
+      "learning_rate": 0.0001934467391592146,
+      "loss": 0.7437,
+      "step": 356
+    },
+    {
+      "epoch": 0.1428,
+      "grad_norm": 0.367684000824685,
+      "learning_rate": 0.00019340053456719768,
+      "loss": 0.7387,
+      "step": 357
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.3634548661346829,
+      "learning_rate": 0.00019335417321867987,
+      "loss": 0.6985,
+      "step": 358
+    },
+    {
+      "epoch": 0.1436,
+      "grad_norm": 0.38248846649804163,
+      "learning_rate": 0.0001933076551914706,
+      "loss": 0.6946,
+      "step": 359
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3903837776823106,
+      "learning_rate": 0.00019326098056364222,
+      "loss": 0.738,
+      "step": 360
+    },
+    {
+      "epoch": 0.1444,
+      "grad_norm": 0.37316510939032765,
+      "learning_rate": 0.00019321414941353003,
+      "loss": 0.7023,
+      "step": 361
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.3843035531540929,
+      "learning_rate": 0.00019316716181973188,
+      "loss": 0.6897,
+      "step": 362
+    },
+    {
+      "epoch": 0.1452,
+      "grad_norm": 0.3960956773681474,
+      "learning_rate": 0.00019312001786110828,
+      "loss": 0.7433,
+      "step": 363
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4101148329592276,
+      "learning_rate": 0.00019307271761678213,
+      "loss": 0.68,
+      "step": 364
+    },
+    {
+      "epoch": 0.146,
+      "grad_norm": 0.4035799883510049,
+      "learning_rate": 0.00019302526116613864,
+      "loss": 0.7756,
+      "step": 365
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.34335587702974607,
+      "learning_rate": 0.00019297764858882514,
+      "loss": 0.6584,
+      "step": 366
+    },
+    {
+      "epoch": 0.1468,
+      "grad_norm": 0.41575729551259694,
+      "learning_rate": 0.00019292987996475113,
+      "loss": 0.7255,
+      "step": 367
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3987444219453461,
+      "learning_rate": 0.0001928819553740878,
+      "loss": 0.7267,
+      "step": 368
+    },
+    {
+      "epoch": 0.1476,
+      "grad_norm": 0.36645586098492455,
+      "learning_rate": 0.00019283387489726827,
+      "loss": 0.6392,
+      "step": 369
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.3901779305178469,
+      "learning_rate": 0.00019278563861498723,
+      "loss": 0.7727,
+      "step": 370
+    },
+    {
+      "epoch": 0.1484,
+      "grad_norm": 0.3887595088054966,
+      "learning_rate": 0.00019273724660820088,
+      "loss": 0.6535,
+      "step": 371
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.3846140376662957,
+      "learning_rate": 0.00019268869895812672,
+      "loss": 0.7625,
+      "step": 372
+    },
+    {
+      "epoch": 0.1492,
+      "grad_norm": 0.4217259319075013,
+      "learning_rate": 0.00019263999574624355,
+      "loss": 0.7694,
+      "step": 373
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.38627934512584944,
+      "learning_rate": 0.0001925911370542912,
+      "loss": 0.7557,
+      "step": 374
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.39726895858256805,
+      "learning_rate": 0.00019254212296427044,
+      "loss": 0.696,
+      "step": 375
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.39509375724648294,
+      "learning_rate": 0.00019249295355844285,
+      "loss": 0.6939,
+      "step": 376
+    },
+    {
+      "epoch": 0.1508,
+      "grad_norm": 0.39093740307759517,
+      "learning_rate": 0.00019244362891933077,
+      "loss": 0.7079,
+      "step": 377
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.3752349281466314,
+      "learning_rate": 0.00019239414912971696,
+      "loss": 0.6919,
+      "step": 378
+    },
+    {
+      "epoch": 0.1516,
+      "grad_norm": 0.3680847322213261,
+      "learning_rate": 0.0001923445142726446,
+      "loss": 0.7116,
+      "step": 379
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.37204500135886637,
+      "learning_rate": 0.0001922947244314172,
+      "loss": 0.7019,
+      "step": 380
+    },
+    {
+      "epoch": 0.1524,
+      "grad_norm": 0.40124819438119996,
+      "learning_rate": 0.0001922447796895982,
+      "loss": 0.7217,
+      "step": 381
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.3865757435112636,
+      "learning_rate": 0.00019219468013101124,
+      "loss": 0.7401,
+      "step": 382
+    },
+    {
+      "epoch": 0.1532,
+      "grad_norm": 0.37950392105185105,
+      "learning_rate": 0.00019214442583973966,
+      "loss": 0.72,
+      "step": 383
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.36805410693362434,
+      "learning_rate": 0.00019209401690012653,
+      "loss": 0.7653,
+      "step": 384
+    },
+    {
+      "epoch": 0.154,
+      "grad_norm": 0.3574885525687318,
+      "learning_rate": 0.00019204345339677442,
+      "loss": 0.7135,
+      "step": 385
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.3628661116738235,
+      "learning_rate": 0.00019199273541454538,
+      "loss": 0.6446,
+      "step": 386
+    },
+    {
+      "epoch": 0.1548,
+      "grad_norm": 0.37492897305274625,
+      "learning_rate": 0.00019194186303856067,
+      "loss": 0.698,
+      "step": 387
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.38152196770466545,
+      "learning_rate": 0.00019189083635420075,
+      "loss": 0.693,
+      "step": 388
+    },
+    {
+      "epoch": 0.1556,
+      "grad_norm": 0.38450336508215077,
+      "learning_rate": 0.00019183965544710495,
+      "loss": 0.6905,
+      "step": 389
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.3712084726550805,
+      "learning_rate": 0.00019178832040317155,
+      "loss": 0.6671,
+      "step": 390
+    },
+    {
+      "epoch": 0.1564,
+      "grad_norm": 0.3616802203123445,
+      "learning_rate": 0.0001917368313085574,
+      "loss": 0.6671,
+      "step": 391
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.3721551241618506,
+      "learning_rate": 0.00019168518824967795,
+      "loss": 0.7082,
+      "step": 392
+    },
+    {
+      "epoch": 0.1572,
+      "grad_norm": 0.3738096531629254,
+      "learning_rate": 0.00019163339131320718,
+      "loss": 0.6859,
+      "step": 393
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.3747029831223078,
+      "learning_rate": 0.00019158144058607708,
+      "loss": 0.6783,
+      "step": 394
+    },
+    {
+      "epoch": 0.158,
+      "grad_norm": 0.3643373612674067,
+      "learning_rate": 0.00019152933615547798,
+      "loss": 0.6779,
+      "step": 395
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4025633496093721,
+      "learning_rate": 0.000191477078108858,
+      "loss": 0.6875,
+      "step": 396
+    },
+    {
+      "epoch": 0.1588,
+      "grad_norm": 0.3890515795999232,
+      "learning_rate": 0.00019142466653392318,
+      "loss": 0.7375,
+      "step": 397
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.41196475648797765,
+      "learning_rate": 0.0001913721015186372,
+      "loss": 0.8219,
+      "step": 398
+    },
+    {
+      "epoch": 0.1596,
+      "grad_norm": 0.36303264191560275,
+      "learning_rate": 0.0001913193831512213,
+      "loss": 0.6539,
+      "step": 399
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3733774345130003,
+      "learning_rate": 0.00019126651152015403,
+      "loss": 0.7258,
+      "step": 400
+    },
+    {
+      "epoch": 0.1604,
+      "grad_norm": 0.3600148652565467,
+      "learning_rate": 0.0001912134867141712,
+      "loss": 0.6763,
+      "step": 401
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.4130332089907557,
+      "learning_rate": 0.0001911603088222657,
+      "loss": 0.6829,
+      "step": 402
+    },
+    {
+      "epoch": 0.1612,
+      "grad_norm": 0.37915621184314446,
+      "learning_rate": 0.0001911069779336873,
+      "loss": 0.7099,
+      "step": 403
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.3982697677981037,
+      "learning_rate": 0.00019105349413794272,
+      "loss": 0.7706,
+      "step": 404
+    },
+    {
+      "epoch": 0.162,
+      "grad_norm": 0.38458590119885516,
+      "learning_rate": 0.00019099985752479506,
+      "loss": 0.7495,
+      "step": 405
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.40139329153688263,
+      "learning_rate": 0.00019094606818426403,
+      "loss": 0.7122,
+      "step": 406
+    },
+    {
+      "epoch": 0.1628,
+      "grad_norm": 0.38138079047942747,
+      "learning_rate": 0.00019089212620662568,
+      "loss": 0.7237,
+      "step": 407
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.367840917838084,
+      "learning_rate": 0.00019083803168241223,
+      "loss": 0.7042,
+      "step": 408
+    },
+    {
+      "epoch": 0.1636,
+      "grad_norm": 0.3719462302258643,
+      "learning_rate": 0.00019078378470241183,
+      "loss": 0.7003,
+      "step": 409
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.3871663286058704,
+      "learning_rate": 0.00019072938535766865,
+      "loss": 0.699,
+      "step": 410
+    },
+    {
+      "epoch": 0.1644,
+      "grad_norm": 0.37122376588124145,
+      "learning_rate": 0.00019067483373948243,
+      "loss": 0.6396,
+      "step": 411
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.36458011191898626,
+      "learning_rate": 0.00019062012993940859,
+      "loss": 0.7197,
+      "step": 412
+    },
+    {
+      "epoch": 0.1652,
+      "grad_norm": 0.39366153882601723,
+      "learning_rate": 0.00019056527404925789,
+      "loss": 0.7104,
+      "step": 413
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.3772750003433443,
+      "learning_rate": 0.00019051026616109638,
+      "loss": 0.7374,
+      "step": 414
+    },
+    {
+      "epoch": 0.166,
+      "grad_norm": 0.3717203779567961,
+      "learning_rate": 0.0001904551063672452,
+      "loss": 0.7414,
+      "step": 415
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.366581005775702,
+      "learning_rate": 0.00019039979476028043,
+      "loss": 0.688,
+      "step": 416
+    },
+    {
+      "epoch": 0.1668,
+      "grad_norm": 0.39793014724181996,
+      "learning_rate": 0.000190344331433033,
+      "loss": 0.6706,
+      "step": 417
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.3909005122636842,
+      "learning_rate": 0.00019028871647858834,
+      "loss": 0.7149,
+      "step": 418
+    },
+    {
+      "epoch": 0.1676,
+      "grad_norm": 0.3827833190394904,
+      "learning_rate": 0.00019023294999028653,
+      "loss": 0.7067,
+      "step": 419
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3707544804332314,
+      "learning_rate": 0.00019017703206172185,
+      "loss": 0.7368,
+      "step": 420
+    },
+    {
+      "epoch": 0.1684,
+      "grad_norm": 0.387842364505732,
+      "learning_rate": 0.0001901209627867428,
+      "loss": 0.6957,
+      "step": 421
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.3586581412742944,
+      "learning_rate": 0.0001900647422594519,
+      "loss": 0.7095,
+      "step": 422
+    },
+    {
+      "epoch": 0.1692,
+      "grad_norm": 0.3776421164866786,
+      "learning_rate": 0.0001900083705742054,
+      "loss": 0.7314,
+      "step": 423
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3518309566121195,
+      "learning_rate": 0.00018995184782561345,
+      "loss": 0.662,
+      "step": 424
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.35175767335896485,
+      "learning_rate": 0.00018989517410853955,
+      "loss": 0.6484,
+      "step": 425
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.37014079477382306,
+      "learning_rate": 0.0001898383495181007,
+      "loss": 0.6885,
+      "step": 426
+    },
+    {
+      "epoch": 0.1708,
+      "grad_norm": 0.3999729258228388,
+      "learning_rate": 0.00018978137414966698,
+      "loss": 0.6745,
+      "step": 427
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.391206313530272,
+      "learning_rate": 0.0001897242480988617,
+      "loss": 0.6987,
+      "step": 428
+    },
+    {
+      "epoch": 0.1716,
+      "grad_norm": 0.3760066653882305,
+      "learning_rate": 0.00018966697146156092,
+      "loss": 0.6594,
+      "step": 429
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.41634437705839444,
+      "learning_rate": 0.00018960954433389345,
+      "loss": 0.7287,
+      "step": 430
+    },
+    {
+      "epoch": 0.1724,
+      "grad_norm": 0.38316647013622684,
+      "learning_rate": 0.0001895519668122408,
+      "loss": 0.716,
+      "step": 431
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3859147191673041,
+      "learning_rate": 0.0001894942389932367,
+      "loss": 0.688,
+      "step": 432
+    },
+    {
+      "epoch": 0.1732,
+      "grad_norm": 0.38536780897260897,
+      "learning_rate": 0.00018943636097376726,
+      "loss": 0.6973,
+      "step": 433
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.38022975855348395,
+      "learning_rate": 0.00018937833285097066,
+      "loss": 0.6921,
+      "step": 434
+    },
+    {
+      "epoch": 0.174,
+      "grad_norm": 0.3727868538535105,
+      "learning_rate": 0.00018932015472223693,
+      "loss": 0.6858,
+      "step": 435
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.39691838283866077,
+      "learning_rate": 0.00018926182668520792,
+      "loss": 0.7093,
+      "step": 436
+    },
+    {
+      "epoch": 0.1748,
+      "grad_norm": 0.386131787817755,
+      "learning_rate": 0.0001892033488377771,
+      "loss": 0.7128,
+      "step": 437
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.3866928717931557,
+      "learning_rate": 0.0001891447212780893,
+      "loss": 0.6833,
+      "step": 438
+    },
+    {
+      "epoch": 0.1756,
+      "grad_norm": 0.37440549427127356,
+      "learning_rate": 0.0001890859441045407,
+      "loss": 0.7292,
+      "step": 439
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3966367147518759,
+      "learning_rate": 0.0001890270174157784,
+      "loss": 0.7243,
+      "step": 440
+    },
+    {
+      "epoch": 0.1764,
+      "grad_norm": 0.3725940293890487,
+      "learning_rate": 0.00018896794131070073,
+      "loss": 0.6948,
+      "step": 441
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.38919233794834746,
+      "learning_rate": 0.0001889087158884565,
+      "loss": 0.7018,
+      "step": 442
+    },
+    {
+      "epoch": 0.1772,
+      "grad_norm": 0.37745830074894476,
+      "learning_rate": 0.00018884934124844532,
+      "loss": 0.7484,
+      "step": 443
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.3854960456283892,
+      "learning_rate": 0.00018878981749031716,
+      "loss": 0.676,
+      "step": 444
+    },
+    {
+      "epoch": 0.178,
+      "grad_norm": 0.38826431295550146,
+      "learning_rate": 0.00018873014471397224,
+      "loss": 0.6926,
+      "step": 445
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.3802235682610009,
+      "learning_rate": 0.00018867032301956088,
+      "loss": 0.6411,
+      "step": 446
+    },
+    {
+      "epoch": 0.1788,
+      "grad_norm": 0.3686143691302594,
+      "learning_rate": 0.00018861035250748343,
+      "loss": 0.6984,
+      "step": 447
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.37565487988507895,
+      "learning_rate": 0.00018855023327838983,
+      "loss": 0.7024,
+      "step": 448
+    },
+    {
+      "epoch": 0.1796,
+      "grad_norm": 0.3729423273925913,
+      "learning_rate": 0.00018848996543317982,
+      "loss": 0.7043,
+      "step": 449
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.38784766614091365,
+      "learning_rate": 0.00018842954907300236,
+      "loss": 0.7266,
+      "step": 450
+    },
+    {
+      "epoch": 0.1804,
+      "grad_norm": 0.40027503237144674,
+      "learning_rate": 0.00018836898429925585,
+      "loss": 0.7562,
+      "step": 451
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3828729125010602,
+      "learning_rate": 0.0001883082712135877,
+      "loss": 0.6473,
+      "step": 452
+    },
+    {
+      "epoch": 0.1812,
+      "grad_norm": 0.40764363051902,
+      "learning_rate": 0.00018824740991789415,
+      "loss": 0.7135,
+      "step": 453
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.3672450974926339,
+      "learning_rate": 0.00018818640051432035,
+      "loss": 0.7266,
+      "step": 454
+    },
+    {
+      "epoch": 0.182,
+      "grad_norm": 0.3638359282622377,
+      "learning_rate": 0.0001881252431052599,
+      "loss": 0.7062,
+      "step": 455
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3718509816996577,
+      "learning_rate": 0.00018806393779335483,
+      "loss": 0.6764,
+      "step": 456
+    },
+    {
+      "epoch": 0.1828,
+      "grad_norm": 0.3703550388188459,
+      "learning_rate": 0.00018800248468149543,
+      "loss": 0.6773,
+      "step": 457
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.3880381643824319,
+      "learning_rate": 0.00018794088387282,
+      "loss": 0.6608,
+      "step": 458
+    },
+    {
+      "epoch": 0.1836,
+      "grad_norm": 0.5023937375660238,
+      "learning_rate": 0.00018787913547071484,
+      "loss": 0.6839,
+      "step": 459
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.38790606135318434,
+      "learning_rate": 0.00018781723957881372,
+      "loss": 0.7163,
+      "step": 460
+    },
+    {
+      "epoch": 0.1844,
+      "grad_norm": 0.38612530797035416,
+      "learning_rate": 0.0001877551963009982,
+      "loss": 0.6792,
+      "step": 461
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.39648088910610624,
+      "learning_rate": 0.0001876930057413971,
+      "loss": 0.6931,
+      "step": 462
+    },
+    {
+      "epoch": 0.1852,
+      "grad_norm": 0.3620504076730805,
+      "learning_rate": 0.00018763066800438636,
+      "loss": 0.7105,
+      "step": 463
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.381143525075452,
+      "learning_rate": 0.00018756818319458907,
+      "loss": 0.7143,
+      "step": 464
+    },
+    {
+      "epoch": 0.186,
+      "grad_norm": 0.39980114683333307,
+      "learning_rate": 0.000187505551416875,
+      "loss": 0.6536,
+      "step": 465
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.36845926955292746,
+      "learning_rate": 0.0001874427727763607,
+      "loss": 0.7263,
+      "step": 466
+    },
+    {
+      "epoch": 0.1868,
+      "grad_norm": 0.38586647673684304,
+      "learning_rate": 0.0001873798473784092,
+      "loss": 0.6942,
+      "step": 467
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.39590597916453313,
+      "learning_rate": 0.00018731677532862976,
+      "loss": 0.6978,
+      "step": 468
+    },
+    {
+      "epoch": 0.1876,
+      "grad_norm": 0.3612337720822994,
+      "learning_rate": 0.00018725355673287778,
+      "loss": 0.6745,
+      "step": 469
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.3977639592238108,
+      "learning_rate": 0.00018719019169725472,
+      "loss": 0.6738,
+      "step": 470
+    },
+    {
+      "epoch": 0.1884,
+      "grad_norm": 0.3718333358835038,
+      "learning_rate": 0.00018712668032810768,
+      "loss": 0.6761,
+      "step": 471
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3836908486345542,
+      "learning_rate": 0.00018706302273202943,
+      "loss": 0.6954,
+      "step": 472
+    },
+    {
+      "epoch": 0.1892,
+      "grad_norm": 0.36142278549101325,
+      "learning_rate": 0.00018699921901585813,
+      "loss": 0.6762,
+      "step": 473
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.37599764474175124,
+      "learning_rate": 0.0001869352692866772,
+      "loss": 0.7131,
+      "step": 474
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.37048262581086955,
+      "learning_rate": 0.00018687117365181512,
+      "loss": 0.6972,
+      "step": 475
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.3600691572898928,
+      "learning_rate": 0.00018680693221884517,
+      "loss": 0.7422,
+      "step": 476
+    },
+    {
+      "epoch": 0.1908,
+      "grad_norm": 0.3818288691556821,
+      "learning_rate": 0.00018674254509558544,
+      "loss": 0.7011,
+      "step": 477
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.3674052799125421,
+      "learning_rate": 0.00018667801239009846,
+      "loss": 0.705,
+      "step": 478
+    },
+    {
+      "epoch": 0.1916,
+      "grad_norm": 0.41930911669276566,
+      "learning_rate": 0.00018661333421069113,
+      "loss": 0.718,
+      "step": 479
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3913531615832862,
+      "learning_rate": 0.00018654851066591448,
+      "loss": 0.7438,
+      "step": 480
+    },
+    {
+      "epoch": 0.1924,
+      "grad_norm": 0.3557742897729816,
+      "learning_rate": 0.00018648354186456348,
+      "loss": 0.669,
+      "step": 481
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.3731384423646767,
+      "learning_rate": 0.000186418427915677,
+      "loss": 0.6502,
+      "step": 482
+    },
+    {
+      "epoch": 0.1932,
+      "grad_norm": 0.3867524755105473,
+      "learning_rate": 0.00018635316892853741,
+      "loss": 0.6892,
+      "step": 483
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.3650916952976356,
+      "learning_rate": 0.00018628776501267052,
+      "loss": 0.7087,
+      "step": 484
+    },
+    {
+      "epoch": 0.194,
+      "grad_norm": 0.3624557636642345,
+      "learning_rate": 0.0001862222162778454,
+      "loss": 0.6799,
+      "step": 485
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.3570561492880132,
+      "learning_rate": 0.0001861565228340742,
+      "loss": 0.7019,
+      "step": 486
+    },
+    {
+      "epoch": 0.1948,
+      "grad_norm": 0.37587842187370224,
+      "learning_rate": 0.00018609068479161182,
+      "loss": 0.692,
+      "step": 487
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3778178257632083,
+      "learning_rate": 0.00018602470226095603,
+      "loss": 0.7456,
+      "step": 488
+    },
+    {
+      "epoch": 0.1956,
+      "grad_norm": 0.37179725458621526,
+      "learning_rate": 0.00018595857535284692,
+      "loss": 0.6848,
+      "step": 489
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.40301659971709364,
+      "learning_rate": 0.00018589230417826697,
+      "loss": 0.7707,
+      "step": 490
+    },
+    {
+      "epoch": 0.1964,
+      "grad_norm": 0.3812236492427729,
+      "learning_rate": 0.00018582588884844084,
+      "loss": 0.6561,
+      "step": 491
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.36485508331427363,
+      "learning_rate": 0.00018575932947483502,
+      "loss": 0.6637,
+      "step": 492
+    },
+    {
+      "epoch": 0.1972,
+      "grad_norm": 0.3655147118213809,
+      "learning_rate": 0.00018569262616915784,
+      "loss": 0.6627,
+      "step": 493
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.35720316681945796,
+      "learning_rate": 0.00018562577904335912,
+      "loss": 0.6774,
+      "step": 494
+    },
+    {
+      "epoch": 0.198,
+      "grad_norm": 0.3662569867994764,
+      "learning_rate": 0.00018555878820963013,
+      "loss": 0.6054,
+      "step": 495
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3840820833030506,
+      "learning_rate": 0.00018549165378040327,
+      "loss": 0.6556,
+      "step": 496
+    },
+    {
+      "epoch": 0.1988,
+      "grad_norm": 0.3896881309740263,
+      "learning_rate": 0.00018542437586835202,
+      "loss": 0.7036,
+      "step": 497
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.3948463129162245,
+      "learning_rate": 0.00018535695458639056,
+      "loss": 0.708,
+      "step": 498
+    },
+    {
+      "epoch": 0.1996,
+      "grad_norm": 0.41169778255777534,
+      "learning_rate": 0.00018528939004767376,
+      "loss": 0.715,
+      "step": 499
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3879039540215038,
+      "learning_rate": 0.00018522168236559695,
+      "loss": 0.7172,
+      "step": 500
+    },
+    {
+      "epoch": 0.2004,
+      "grad_norm": 0.355510555850886,
+      "learning_rate": 0.0001851538316537956,
+      "loss": 0.6596,
+      "step": 501
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.3610419600173354,
+      "learning_rate": 0.0001850858380261453,
+      "loss": 0.6548,
+      "step": 502
+    },
+    {
+      "epoch": 0.2012,
+      "grad_norm": 0.4072826093126328,
+      "learning_rate": 0.00018501770159676156,
+      "loss": 0.7069,
+      "step": 503
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3734772225484452,
+      "learning_rate": 0.0001849494224799994,
+      "loss": 0.7277,
+      "step": 504
+    },
+    {
+      "epoch": 0.202,
+      "grad_norm": 0.3954106111018613,
+      "learning_rate": 0.00018488100079045344,
+      "loss": 0.7542,
+      "step": 505
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.36858612321909856,
+      "learning_rate": 0.0001848124366429576,
+      "loss": 0.6988,
+      "step": 506
+    },
+    {
+      "epoch": 0.2028,
+      "grad_norm": 0.4026041820405598,
+      "learning_rate": 0.00018474373015258473,
+      "loss": 0.7183,
+      "step": 507
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3747512856185188,
+      "learning_rate": 0.0001846748814346468,
+      "loss": 0.6292,
+      "step": 508
+    },
+    {
+      "epoch": 0.2036,
+      "grad_norm": 0.40646861586250593,
+      "learning_rate": 0.00018460589060469425,
+      "loss": 0.7361,
+      "step": 509
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.3712373168619134,
+      "learning_rate": 0.00018453675777851627,
+      "loss": 0.6965,
+      "step": 510
+    },
+    {
+      "epoch": 0.2044,
+      "grad_norm": 0.3822153730339224,
+      "learning_rate": 0.00018446748307214019,
+      "loss": 0.7495,
+      "step": 511
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.37312144201226494,
+      "learning_rate": 0.0001843980666018315,
+      "loss": 0.705,
+      "step": 512
+    },
+    {
+      "epoch": 0.2052,
+      "grad_norm": 0.385950831839263,
+      "learning_rate": 0.00018432850848409363,
+      "loss": 0.7575,
+      "step": 513
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.3741140719848538,
+      "learning_rate": 0.00018425880883566782,
+      "loss": 0.7222,
+      "step": 514
+    },
+    {
+      "epoch": 0.206,
+      "grad_norm": 0.3972815278946878,
+      "learning_rate": 0.0001841889677735327,
+      "loss": 0.7346,
+      "step": 515
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.38212963630442465,
+      "learning_rate": 0.00018411898541490434,
+      "loss": 0.6665,
+      "step": 516
+    },
+    {
+      "epoch": 0.2068,
+      "grad_norm": 0.3825583560569224,
+      "learning_rate": 0.0001840488618772359,
+      "loss": 0.6931,
+      "step": 517
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.39728934202351796,
+      "learning_rate": 0.00018397859727821748,
+      "loss": 0.7092,
+      "step": 518
+    },
+    {
+      "epoch": 0.2076,
+      "grad_norm": 0.40555236197334826,
+      "learning_rate": 0.00018390819173577598,
+      "loss": 0.6769,
+      "step": 519
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5609157488299549,
+      "learning_rate": 0.00018383764536807485,
+      "loss": 0.6946,
+      "step": 520
+    },
+    {
+      "epoch": 0.2084,
+      "grad_norm": 0.38043954785026474,
+      "learning_rate": 0.00018376695829351377,
+      "loss": 0.6994,
+      "step": 521
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.3716156223121875,
+      "learning_rate": 0.00018369613063072874,
+      "loss": 0.6695,
+      "step": 522
+    },
+    {
+      "epoch": 0.2092,
+      "grad_norm": 0.38068514825235367,
+      "learning_rate": 0.00018362516249859163,
+      "loss": 0.7272,
+      "step": 523
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.3754330491889234,
+      "learning_rate": 0.00018355405401621001,
+      "loss": 0.6926,
+      "step": 524
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.3999675352290636,
+      "learning_rate": 0.00018348280530292713,
+      "loss": 0.716,
+      "step": 525
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.37528982607451183,
+      "learning_rate": 0.00018341141647832147,
+      "loss": 0.6607,
+      "step": 526
+    },
+    {
+      "epoch": 0.2108,
+      "grad_norm": 0.39089716567979277,
+      "learning_rate": 0.00018333988766220676,
+      "loss": 0.6711,
+      "step": 527
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3658466970924529,
+      "learning_rate": 0.0001832682189746316,
+      "loss": 0.7213,
+      "step": 528
+    },
+    {
+      "epoch": 0.2116,
+      "grad_norm": 0.36261304242450976,
+      "learning_rate": 0.00018319641053587938,
+      "loss": 0.7012,
+      "step": 529
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.3702824148173612,
+      "learning_rate": 0.0001831244624664681,
+      "loss": 0.7397,
+      "step": 530
+    },
+    {
+      "epoch": 0.2124,
+      "grad_norm": 0.3762171911426314,
+      "learning_rate": 0.00018305237488714995,
+      "loss": 0.6978,
+      "step": 531
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3416142482473879,
+      "learning_rate": 0.00018298014791891137,
+      "loss": 0.6812,
+      "step": 532
+    },
+    {
+      "epoch": 0.2132,
+      "grad_norm": 0.3611274469188154,
+      "learning_rate": 0.00018290778168297277,
+      "loss": 0.7329,
+      "step": 533
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.3757944321097854,
+      "learning_rate": 0.00018283527630078825,
+      "loss": 0.6581,
+      "step": 534
+    },
+    {
+      "epoch": 0.214,
+      "grad_norm": 0.35800642170913016,
+      "learning_rate": 0.0001827626318940454,
+      "loss": 0.6836,
+      "step": 535
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.38446712295634733,
+      "learning_rate": 0.00018268984858466522,
+      "loss": 0.6543,
+      "step": 536
+    },
+    {
+      "epoch": 0.2148,
+      "grad_norm": 0.4016318412652301,
+      "learning_rate": 0.00018261692649480175,
+      "loss": 0.7244,
+      "step": 537
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.3809971119904624,
+      "learning_rate": 0.00018254386574684204,
+      "loss": 0.7082,
+      "step": 538
+    },
+    {
+      "epoch": 0.2156,
+      "grad_norm": 0.42070210464610197,
+      "learning_rate": 0.0001824706664634058,
+      "loss": 0.698,
+      "step": 539
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3654952949633688,
+      "learning_rate": 0.00018239732876734527,
+      "loss": 0.639,
+      "step": 540
+    },
+    {
+      "epoch": 0.2164,
+      "grad_norm": 0.3815611739066012,
+      "learning_rate": 0.0001823238527817449,
+      "loss": 0.7342,
+      "step": 541
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.3883767391244725,
+      "learning_rate": 0.00018225023862992142,
+      "loss": 0.7583,
+      "step": 542
+    },
+    {
+      "epoch": 0.2172,
+      "grad_norm": 0.3824779257270353,
+      "learning_rate": 0.00018217648643542323,
+      "loss": 0.6867,
+      "step": 543
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3607319379743882,
+      "learning_rate": 0.0001821025963220306,
+      "loss": 0.6877,
+      "step": 544
+    },
+    {
+      "epoch": 0.218,
+      "grad_norm": 0.3860102243656817,
+      "learning_rate": 0.00018202856841375518,
+      "loss": 0.7322,
+      "step": 545
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.35714157126866863,
+      "learning_rate": 0.00018195440283483988,
+      "loss": 0.6767,
+      "step": 546
+    },
+    {
+      "epoch": 0.2188,
+      "grad_norm": 0.3824467030729281,
+      "learning_rate": 0.0001818800997097587,
+      "loss": 0.7056,
+      "step": 547
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.40057749915303886,
+      "learning_rate": 0.00018180565916321647,
+      "loss": 0.6617,
+      "step": 548
+    },
+    {
+      "epoch": 0.2196,
+      "grad_norm": 0.38911454764153103,
+      "learning_rate": 0.0001817310813201486,
+      "loss": 0.7121,
+      "step": 549
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.3830635611411421,
+      "learning_rate": 0.0001816563663057211,
+      "loss": 0.6557,
+      "step": 550
+    },
+    {
+      "epoch": 0.2204,
+      "grad_norm": 0.36468480046935936,
+      "learning_rate": 0.00018158151424533002,
+      "loss": 0.6871,
+      "step": 551
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3668031521515548,
+      "learning_rate": 0.00018150652526460146,
+      "loss": 0.6871,
+      "step": 552
+    },
+    {
+      "epoch": 0.2212,
+      "grad_norm": 0.3768692147099381,
+      "learning_rate": 0.00018143139948939137,
+      "loss": 0.669,
+      "step": 553
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.3832613262529067,
+      "learning_rate": 0.00018135613704578526,
+      "loss": 0.6778,
+      "step": 554
+    },
+    {
+      "epoch": 0.222,
+      "grad_norm": 0.3803908262267479,
+      "learning_rate": 0.000181280738060098,
+      "loss": 0.7378,
+      "step": 555
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.3662989151567502,
+      "learning_rate": 0.00018120520265887363,
+      "loss": 0.7141,
+      "step": 556
+    },
+    {
+      "epoch": 0.2228,
+      "grad_norm": 0.35040922293491916,
+      "learning_rate": 0.00018112953096888516,
+      "loss": 0.6102,
+      "step": 557
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.3674158329615544,
+      "learning_rate": 0.00018105372311713432,
+      "loss": 0.7266,
+      "step": 558
+    },
+    {
+      "epoch": 0.2236,
+      "grad_norm": 0.3616944009768484,
+      "learning_rate": 0.0001809777792308513,
+      "loss": 0.6999,
+      "step": 559
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.36693103145374456,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.6683,
+      "step": 560
+    },
+    {
+      "epoch": 0.2244,
+      "grad_norm": 0.36337409937433873,
+      "learning_rate": 0.0001808254838647513,
+      "loss": 0.648,
+      "step": 561
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.38399518685713585,
+      "learning_rate": 0.00018074913264053545,
+      "loss": 0.7389,
+      "step": 562
+    },
+    {
+      "epoch": 0.2252,
+      "grad_norm": 0.37042175198125565,
+      "learning_rate": 0.00018067264589298945,
+      "loss": 0.6548,
+      "step": 563
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.37049712923697325,
+      "learning_rate": 0.00018059602375048293,
+      "loss": 0.7153,
+      "step": 564
+    },
+    {
+      "epoch": 0.226,
+      "grad_norm": 0.3801771827533767,
+      "learning_rate": 0.00018051926634161282,
+      "loss": 0.6598,
+      "step": 565
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.38212696328862,
+      "learning_rate": 0.00018044237379520305,
+      "loss": 0.6387,
+      "step": 566
+    },
+    {
+      "epoch": 0.2268,
+      "grad_norm": 0.3613484282834108,
+      "learning_rate": 0.0001803653462403043,
+      "loss": 0.6351,
+      "step": 567
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3685052769200213,
+      "learning_rate": 0.0001802881838061939,
+      "loss": 0.5813,
+      "step": 568
+    },
+    {
+      "epoch": 0.2276,
+      "grad_norm": 0.38269798265593397,
+      "learning_rate": 0.00018021088662237552,
+      "loss": 0.7184,
+      "step": 569
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.3706152341283996,
+      "learning_rate": 0.00018013345481857903,
+      "loss": 0.6913,
+      "step": 570
+    },
+    {
+      "epoch": 0.2284,
+      "grad_norm": 0.35506592800145376,
+      "learning_rate": 0.00018005588852476015,
+      "loss": 0.6613,
+      "step": 571
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.3757597285473621,
+      "learning_rate": 0.00017997818787110042,
+      "loss": 0.6994,
+      "step": 572
+    },
+    {
+      "epoch": 0.2292,
+      "grad_norm": 0.3818200309145151,
+      "learning_rate": 0.0001799003529880068,
+      "loss": 0.696,
+      "step": 573
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.36612758373419035,
+      "learning_rate": 0.0001798223840061116,
+      "loss": 0.6853,
+      "step": 574
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.3613173490452758,
+      "learning_rate": 0.00017974428105627208,
+      "loss": 0.7036,
+      "step": 575
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3891757919963155,
+      "learning_rate": 0.00017966604426957047,
+      "loss": 0.7426,
+      "step": 576
+    },
+    {
+      "epoch": 0.2308,
+      "grad_norm": 0.3576378134002283,
+      "learning_rate": 0.00017958767377731358,
+      "loss": 0.6839,
+      "step": 577
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.3526132751603115,
+      "learning_rate": 0.00017950916971103259,
+      "loss": 0.6689,
+      "step": 578
+    },
+    {
+      "epoch": 0.2316,
+      "grad_norm": 0.35281699044640497,
+      "learning_rate": 0.00017943053220248283,
+      "loss": 0.6669,
+      "step": 579
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.38185895530607183,
+      "learning_rate": 0.0001793517613836437,
+      "loss": 0.6896,
+      "step": 580
+    },
+    {
+      "epoch": 0.2324,
+      "grad_norm": 0.3808102528089049,
+      "learning_rate": 0.00017927285738671825,
+      "loss": 0.7001,
+      "step": 581
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.40342552440094603,
+      "learning_rate": 0.00017919382034413305,
+      "loss": 0.7306,
+      "step": 582
+    },
+    {
+      "epoch": 0.2332,
+      "grad_norm": 0.3927131868324498,
+      "learning_rate": 0.00017911465038853805,
+      "loss": 0.6665,
+      "step": 583
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3778722392589643,
+      "learning_rate": 0.00017903534765280614,
+      "loss": 0.6318,
+      "step": 584
+    },
+    {
+      "epoch": 0.234,
+      "grad_norm": 0.4151287852900532,
+      "learning_rate": 0.00017895591227003315,
+      "loss": 0.7491,
+      "step": 585
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.4127180502215466,
+      "learning_rate": 0.00017887634437353754,
+      "loss": 0.7258,
+      "step": 586
+    },
+    {
+      "epoch": 0.2348,
+      "grad_norm": 0.3760329520227691,
+      "learning_rate": 0.00017879664409686008,
+      "loss": 0.6348,
+      "step": 587
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.38209501400968743,
+      "learning_rate": 0.00017871681157376383,
+      "loss": 0.7147,
+      "step": 588
+    },
+    {
+      "epoch": 0.2356,
+      "grad_norm": 0.3847708905853376,
+      "learning_rate": 0.00017863684693823374,
+      "loss": 0.6319,
+      "step": 589
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.3840268112970994,
+      "learning_rate": 0.00017855675032447648,
+      "loss": 0.7028,
+      "step": 590
+    },
+    {
+      "epoch": 0.2364,
+      "grad_norm": 0.36321968809268107,
+      "learning_rate": 0.00017847652186692026,
+      "loss": 0.6773,
+      "step": 591
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3889401405169338,
+      "learning_rate": 0.00017839616170021452,
+      "loss": 0.6702,
+      "step": 592
+    },
+    {
+      "epoch": 0.2372,
+      "grad_norm": 0.36552397912944695,
+      "learning_rate": 0.00017831566995922985,
+      "loss": 0.6149,
+      "step": 593
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.41483290567522496,
+      "learning_rate": 0.0001782350467790575,
+      "loss": 0.722,
+      "step": 594
+    },
+    {
+      "epoch": 0.238,
+      "grad_norm": 0.37673867043341613,
+      "learning_rate": 0.00017815429229500946,
+      "loss": 0.6631,
+      "step": 595
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.400236400217791,
+      "learning_rate": 0.00017807340664261802,
+      "loss": 0.7343,
+      "step": 596
+    },
+    {
+      "epoch": 0.2388,
+      "grad_norm": 0.3872262889336039,
+      "learning_rate": 0.00017799238995763568,
+      "loss": 0.6667,
+      "step": 597
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.36611894737878553,
+      "learning_rate": 0.00017791124237603477,
+      "loss": 0.6985,
+      "step": 598
+    },
+    {
+      "epoch": 0.2396,
+      "grad_norm": 0.3794928656592404,
+      "learning_rate": 0.00017782996403400736,
+      "loss": 0.6649,
+      "step": 599
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3737190197254289,
+      "learning_rate": 0.00017774855506796496,
+      "loss": 0.6962,
+      "step": 600
+    },
+    {
+      "epoch": 0.2404,
+      "grad_norm": 0.39463729811356335,
+      "learning_rate": 0.0001776670156145383,
+      "loss": 0.7304,
+      "step": 601
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.3831388487338996,
+      "learning_rate": 0.00017758534581057718,
+      "loss": 0.6629,
+      "step": 602
+    },
+    {
+      "epoch": 0.2412,
+      "grad_norm": 0.43051349105935655,
+      "learning_rate": 0.00017750354579315004,
+      "loss": 0.6777,
+      "step": 603
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.3815835573125915,
+      "learning_rate": 0.00017742161569954398,
+      "loss": 0.7249,
+      "step": 604
+    },
+    {
+      "epoch": 0.242,
+      "grad_norm": 0.40401973764275717,
+      "learning_rate": 0.0001773395556672644,
+      "loss": 0.6964,
+      "step": 605
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.3759793497162433,
+      "learning_rate": 0.0001772573658340347,
+      "loss": 0.6788,
+      "step": 606
+    },
+    {
+      "epoch": 0.2428,
+      "grad_norm": 0.362762388517486,
+      "learning_rate": 0.0001771750463377962,
+      "loss": 0.6848,
+      "step": 607
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.37107964853497377,
+      "learning_rate": 0.00017709259731670774,
+      "loss": 0.6923,
+      "step": 608
+    },
+    {
+      "epoch": 0.2436,
+      "grad_norm": 0.37303727524487695,
+      "learning_rate": 0.00017701001890914572,
+      "loss": 0.6886,
+      "step": 609
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.3748572413204125,
+      "learning_rate": 0.00017692731125370354,
+      "loss": 0.6442,
+      "step": 610
+    },
+    {
+      "epoch": 0.2444,
+      "grad_norm": 0.3871237167689098,
+      "learning_rate": 0.00017684447448919154,
+      "loss": 0.7179,
+      "step": 611
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3852499457759282,
+      "learning_rate": 0.00017676150875463686,
+      "loss": 0.6639,
+      "step": 612
+    },
+    {
+      "epoch": 0.2452,
+      "grad_norm": 0.39892523319546336,
+      "learning_rate": 0.0001766784141892829,
+      "loss": 0.6848,
+      "step": 613
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.38582941181479236,
+      "learning_rate": 0.0001765951909325895,
+      "loss": 0.7209,
+      "step": 614
+    },
+    {
+      "epoch": 0.246,
+      "grad_norm": 0.3693773371571141,
+      "learning_rate": 0.00017651183912423228,
+      "loss": 0.6578,
+      "step": 615
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.41863131309102863,
+      "learning_rate": 0.0001764283589041028,
+      "loss": 0.6928,
+      "step": 616
+    },
+    {
+      "epoch": 0.2468,
+      "grad_norm": 0.3729907993552498,
+      "learning_rate": 0.00017634475041230797,
+      "loss": 0.6805,
+      "step": 617
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.3652078874991456,
+      "learning_rate": 0.00017626101378917004,
+      "loss": 0.6434,
+      "step": 618
+    },
+    {
+      "epoch": 0.2476,
+      "grad_norm": 0.3841317724190862,
+      "learning_rate": 0.0001761771491752264,
+      "loss": 0.6609,
+      "step": 619
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.38851731747912505,
+      "learning_rate": 0.0001760931567112291,
+      "loss": 0.7074,
+      "step": 620
+    },
+    {
+      "epoch": 0.2484,
+      "grad_norm": 0.370551859571575,
+      "learning_rate": 0.0001760090365381449,
+      "loss": 0.6655,
+      "step": 621
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.35499415306837123,
+      "learning_rate": 0.0001759247887971548,
+      "loss": 0.6747,
+      "step": 622
+    },
+    {
+      "epoch": 0.2492,
+      "grad_norm": 0.3845099556724429,
+      "learning_rate": 0.00017584041362965396,
+      "loss": 0.7022,
+      "step": 623
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.37613503730669373,
+      "learning_rate": 0.0001757559111772513,
+      "loss": 0.6742,
+      "step": 624
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.3684496070745272,
+      "learning_rate": 0.00017567128158176953,
+      "loss": 0.6842,
+      "step": 625
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.3888794211882254,
+      "learning_rate": 0.0001755865249852446,
+      "loss": 0.6807,
+      "step": 626
+    },
+    {
+      "epoch": 0.2508,
+      "grad_norm": 0.40567984878124275,
+      "learning_rate": 0.00017550164152992573,
+      "loss": 0.7102,
+      "step": 627
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4117379538927731,
+      "learning_rate": 0.00017541663135827492,
+      "loss": 0.6682,
+      "step": 628
+    },
+    {
+      "epoch": 0.2516,
+      "grad_norm": 0.3966339407445109,
+      "learning_rate": 0.000175331494612967,
+      "loss": 0.7165,
+      "step": 629
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.3849902084616848,
+      "learning_rate": 0.00017524623143688902,
+      "loss": 0.674,
+      "step": 630
+    },
+    {
+      "epoch": 0.2524,
+      "grad_norm": 0.38338662340864577,
+      "learning_rate": 0.00017516084197314046,
+      "loss": 0.705,
+      "step": 631
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.36860128784352547,
+      "learning_rate": 0.00017507532636503256,
+      "loss": 0.7055,
+      "step": 632
+    },
+    {
+      "epoch": 0.2532,
+      "grad_norm": 0.3640341887601648,
+      "learning_rate": 0.00017498968475608838,
+      "loss": 0.6785,
+      "step": 633
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.3702796399289806,
+      "learning_rate": 0.00017490391729004244,
+      "loss": 0.6571,
+      "step": 634
+    },
+    {
+      "epoch": 0.254,
+      "grad_norm": 0.3982786533867165,
+      "learning_rate": 0.00017481802411084042,
+      "loss": 0.6869,
+      "step": 635
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3913728183615705,
+      "learning_rate": 0.00017473200536263905,
+      "loss": 0.6273,
+      "step": 636
+    },
+    {
+      "epoch": 0.2548,
+      "grad_norm": 0.3644492049830817,
+      "learning_rate": 0.0001746458611898058,
+      "loss": 0.6912,
+      "step": 637
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.3850145886141273,
+      "learning_rate": 0.00017455959173691863,
+      "loss": 0.698,
+      "step": 638
+    },
+    {
+      "epoch": 0.2556,
+      "grad_norm": 0.4037588418859427,
+      "learning_rate": 0.00017447319714876579,
+      "loss": 0.6915,
+      "step": 639
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.38514181666674996,
+      "learning_rate": 0.00017438667757034546,
+      "loss": 0.7011,
+      "step": 640
+    },
+    {
+      "epoch": 0.2564,
+      "grad_norm": 0.3804264348600396,
+      "learning_rate": 0.00017430003314686569,
+      "loss": 0.6562,
+      "step": 641
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.36701716408505514,
+      "learning_rate": 0.00017421326402374405,
+      "loss": 0.6247,
+      "step": 642
+    },
+    {
+      "epoch": 0.2572,
+      "grad_norm": 0.3791261640547457,
+      "learning_rate": 0.00017412637034660734,
+      "loss": 0.6627,
+      "step": 643
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.3590754847058719,
+      "learning_rate": 0.0001740393522612915,
+      "loss": 0.6937,
+      "step": 644
+    },
+    {
+      "epoch": 0.258,
+      "grad_norm": 0.35700417054100436,
+      "learning_rate": 0.0001739522099138411,
+      "loss": 0.7102,
+      "step": 645
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.3617050874886994,
+      "learning_rate": 0.00017386494345050942,
+      "loss": 0.7027,
+      "step": 646
+    },
+    {
+      "epoch": 0.2588,
+      "grad_norm": 0.3646321617837386,
+      "learning_rate": 0.000173777553017758,
+      "loss": 0.6648,
+      "step": 647
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3625516760201923,
+      "learning_rate": 0.00017369003876225642,
+      "loss": 0.7316,
+      "step": 648
+    },
+    {
+      "epoch": 0.2596,
+      "grad_norm": 0.37263329756150554,
+      "learning_rate": 0.00017360240083088213,
+      "loss": 0.6924,
+      "step": 649
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.376611064425148,
+      "learning_rate": 0.00017351463937072004,
+      "loss": 0.7019,
+      "step": 650
+    },
+    {
+      "epoch": 0.2604,
+      "grad_norm": 0.36198702197838944,
+      "learning_rate": 0.00017342675452906248,
+      "loss": 0.661,
+      "step": 651
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.3765301050003338,
+      "learning_rate": 0.00017333874645340884,
+      "loss": 0.6803,
+      "step": 652
+    },
+    {
+      "epoch": 0.2612,
+      "grad_norm": 0.369200244627375,
+      "learning_rate": 0.0001732506152914653,
+      "loss": 0.6466,
+      "step": 653
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.3707798765929919,
+      "learning_rate": 0.00017316236119114463,
+      "loss": 0.6924,
+      "step": 654
+    },
+    {
+      "epoch": 0.262,
+      "grad_norm": 0.35393038650976316,
+      "learning_rate": 0.00017307398430056593,
+      "loss": 0.6453,
+      "step": 655
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3643741949790214,
+      "learning_rate": 0.00017298548476805446,
+      "loss": 0.6577,
+      "step": 656
+    },
+    {
+      "epoch": 0.2628,
+      "grad_norm": 0.3953980293463085,
+      "learning_rate": 0.00017289686274214118,
+      "loss": 0.7378,
+      "step": 657
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.37223081584921586,
+      "learning_rate": 0.00017280811837156268,
+      "loss": 0.6593,
+      "step": 658
+    },
+    {
+      "epoch": 0.2636,
+      "grad_norm": 0.38670120610896946,
+      "learning_rate": 0.00017271925180526094,
+      "loss": 0.6253,
+      "step": 659
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.37613165176707375,
+      "learning_rate": 0.00017263026319238301,
+      "loss": 0.6898,
+      "step": 660
+    },
+    {
+      "epoch": 0.2644,
+      "grad_norm": 0.369258827355013,
+      "learning_rate": 0.0001725411526822807,
+      "loss": 0.6723,
+      "step": 661
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.35725508206210926,
+      "learning_rate": 0.0001724519204245105,
+      "loss": 0.6454,
+      "step": 662
+    },
+    {
+      "epoch": 0.2652,
+      "grad_norm": 0.36228632485926626,
+      "learning_rate": 0.0001723625665688331,
+      "loss": 0.6724,
+      "step": 663
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3716104052474549,
+      "learning_rate": 0.00017227309126521348,
+      "loss": 0.7132,
+      "step": 664
+    },
+    {
+      "epoch": 0.266,
+      "grad_norm": 0.3785359212197475,
+      "learning_rate": 0.00017218349466382023,
+      "loss": 0.6516,
+      "step": 665
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.40494771256617085,
+      "learning_rate": 0.00017209377691502565,
+      "loss": 0.7157,
+      "step": 666
+    },
+    {
+      "epoch": 0.2668,
+      "grad_norm": 0.387566905753395,
+      "learning_rate": 0.0001720039381694053,
+      "loss": 0.6663,
+      "step": 667
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4072399001753149,
+      "learning_rate": 0.00017191397857773788,
+      "loss": 0.6819,
+      "step": 668
+    },
+    {
+      "epoch": 0.2676,
+      "grad_norm": 0.35836036729376525,
+      "learning_rate": 0.00017182389829100485,
+      "loss": 0.7076,
+      "step": 669
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.3724433995870969,
+      "learning_rate": 0.00017173369746039025,
+      "loss": 0.6463,
+      "step": 670
+    },
+    {
+      "epoch": 0.2684,
+      "grad_norm": 0.37686765784544657,
+      "learning_rate": 0.00017164337623728045,
+      "loss": 0.7049,
+      "step": 671
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3873353017767109,
+      "learning_rate": 0.00017155293477326384,
+      "loss": 0.7006,
+      "step": 672
+    },
+    {
+      "epoch": 0.2692,
+      "grad_norm": 0.37233428622257264,
+      "learning_rate": 0.00017146237322013068,
+      "loss": 0.6929,
+      "step": 673
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.37389546581786853,
+      "learning_rate": 0.00017137169172987268,
+      "loss": 0.6666,
+      "step": 674
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.3468004782863812,
+      "learning_rate": 0.00017128089045468294,
+      "loss": 0.6896,
+      "step": 675
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.3776082125284464,
+      "learning_rate": 0.00017118996954695553,
+      "loss": 0.6808,
+      "step": 676
+    },
+    {
+      "epoch": 0.2708,
+      "grad_norm": 0.3614927088426095,
+      "learning_rate": 0.00017109892915928535,
+      "loss": 0.6954,
+      "step": 677
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.4890730822409624,
+      "learning_rate": 0.00017100776944446781,
+      "loss": 0.7149,
+      "step": 678
+    },
+    {
+      "epoch": 0.2716,
+      "grad_norm": 0.35165166072488435,
+      "learning_rate": 0.00017091649055549855,
+      "loss": 0.676,
+      "step": 679
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3568818029823244,
+      "learning_rate": 0.0001708250926455733,
+      "loss": 0.7469,
+      "step": 680
+    },
+    {
+      "epoch": 0.2724,
+      "grad_norm": 0.350578974990701,
+      "learning_rate": 0.00017073357586808752,
+      "loss": 0.6526,
+      "step": 681
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.38843902431385663,
+      "learning_rate": 0.0001706419403766361,
+      "loss": 0.7067,
+      "step": 682
+    },
+    {
+      "epoch": 0.2732,
+      "grad_norm": 0.3842213506736467,
+      "learning_rate": 0.00017055018632501325,
+      "loss": 0.6636,
+      "step": 683
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.40072118060337003,
+      "learning_rate": 0.00017045831386721213,
+      "loss": 0.6932,
+      "step": 684
+    },
+    {
+      "epoch": 0.274,
+      "grad_norm": 0.3612018078843182,
+      "learning_rate": 0.00017036632315742462,
+      "loss": 0.6805,
+      "step": 685
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.34390762452297413,
+      "learning_rate": 0.00017027421435004112,
+      "loss": 0.6856,
+      "step": 686
+    },
+    {
+      "epoch": 0.2748,
+      "grad_norm": 0.41820407520924147,
+      "learning_rate": 0.00017018198759965016,
+      "loss": 0.665,
+      "step": 687
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4158476097941947,
+      "learning_rate": 0.00017008964306103823,
+      "loss": 0.5753,
+      "step": 688
+    },
+    {
+      "epoch": 0.2756,
+      "grad_norm": 0.3693476703813386,
+      "learning_rate": 0.00016999718088918955,
+      "loss": 0.6863,
+      "step": 689
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.3605022683275531,
+      "learning_rate": 0.00016990460123928575,
+      "loss": 0.6355,
+      "step": 690
+    },
+    {
+      "epoch": 0.2764,
+      "grad_norm": 0.35584585773469385,
+      "learning_rate": 0.0001698119042667056,
+      "loss": 0.6767,
+      "step": 691
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.40033377797346864,
+      "learning_rate": 0.00016971909012702483,
+      "loss": 0.6137,
+      "step": 692
+    },
+    {
+      "epoch": 0.2772,
+      "grad_norm": 0.3997013259402933,
+      "learning_rate": 0.00016962615897601573,
+      "loss": 0.6606,
+      "step": 693
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.3656529046170065,
+      "learning_rate": 0.00016953311096964705,
+      "loss": 0.6594,
+      "step": 694
+    },
+    {
+      "epoch": 0.278,
+      "grad_norm": 0.4052724353425028,
+      "learning_rate": 0.00016943994626408363,
+      "loss": 0.6518,
+      "step": 695
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.36842188704362194,
+      "learning_rate": 0.00016934666501568617,
+      "loss": 0.6173,
+      "step": 696
+    },
+    {
+      "epoch": 0.2788,
+      "grad_norm": 0.3647712480521673,
+      "learning_rate": 0.00016925326738101098,
+      "loss": 0.6558,
+      "step": 697
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.37886961387347196,
+      "learning_rate": 0.00016915975351680968,
+      "loss": 0.6386,
+      "step": 698
+    },
+    {
+      "epoch": 0.2796,
+      "grad_norm": 0.3685950283804296,
+      "learning_rate": 0.000169066123580029,
+      "loss": 0.6264,
+      "step": 699
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3633280590433798,
+      "learning_rate": 0.00016897237772781044,
+      "loss": 0.6638,
+      "step": 700
+    },
+    {
+      "epoch": 0.2804,
+      "grad_norm": 0.3732030051110284,
+      "learning_rate": 0.00016887851611749005,
+      "loss": 0.655,
+      "step": 701
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.3651617189481963,
+      "learning_rate": 0.00016878453890659814,
+      "loss": 0.7264,
+      "step": 702
+    },
+    {
+      "epoch": 0.2812,
+      "grad_norm": 0.36780411902884236,
+      "learning_rate": 0.0001686904462528591,
+      "loss": 0.6849,
+      "step": 703
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.35258092853914086,
+      "learning_rate": 0.000168596238314191,
+      "loss": 0.6552,
+      "step": 704
+    },
+    {
+      "epoch": 0.282,
+      "grad_norm": 0.3723815062634396,
+      "learning_rate": 0.00016850191524870546,
+      "loss": 0.6614,
+      "step": 705
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.3639460261569966,
+      "learning_rate": 0.00016840747721470731,
+      "loss": 0.6867,
+      "step": 706
+    },
+    {
+      "epoch": 0.2828,
+      "grad_norm": 0.37330295108405304,
+      "learning_rate": 0.00016831292437069427,
+      "loss": 0.6791,
+      "step": 707
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.3738837669495127,
+      "learning_rate": 0.00016821825687535674,
+      "loss": 0.6462,
+      "step": 708
+    },
+    {
+      "epoch": 0.2836,
+      "grad_norm": 0.3713143980678414,
+      "learning_rate": 0.00016812347488757772,
+      "loss": 0.6776,
+      "step": 709
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.3956676625326154,
+      "learning_rate": 0.00016802857856643215,
+      "loss": 0.7719,
+      "step": 710
+    },
+    {
+      "epoch": 0.2844,
+      "grad_norm": 0.3633848095765644,
+      "learning_rate": 0.00016793356807118695,
+      "loss": 0.7062,
+      "step": 711
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.37781250994760474,
+      "learning_rate": 0.00016783844356130071,
+      "loss": 0.6277,
+      "step": 712
+    },
+    {
+      "epoch": 0.2852,
+      "grad_norm": 0.37066789686772716,
+      "learning_rate": 0.0001677432051964233,
+      "loss": 0.6883,
+      "step": 713
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.34506269849481763,
+      "learning_rate": 0.0001676478531363957,
+      "loss": 0.6532,
+      "step": 714
+    },
+    {
+      "epoch": 0.286,
+      "grad_norm": 0.3789970406349399,
+      "learning_rate": 0.00016755238754124965,
+      "loss": 0.6672,
+      "step": 715
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.36544272597592525,
+      "learning_rate": 0.00016745680857120757,
+      "loss": 0.6744,
+      "step": 716
+    },
+    {
+      "epoch": 0.2868,
+      "grad_norm": 0.3728910819407148,
+      "learning_rate": 0.00016736111638668204,
+      "loss": 0.6433,
+      "step": 717
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.3549090324306322,
+      "learning_rate": 0.00016726531114827573,
+      "loss": 0.5937,
+      "step": 718
+    },
+    {
+      "epoch": 0.2876,
+      "grad_norm": 0.35314191620570956,
+      "learning_rate": 0.00016716939301678098,
+      "loss": 0.6245,
+      "step": 719
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.35969773999782906,
+      "learning_rate": 0.00016707336215317968,
+      "loss": 0.6847,
+      "step": 720
+    },
+    {
+      "epoch": 0.2884,
+      "grad_norm": 0.4214046623787497,
+      "learning_rate": 0.00016697721871864284,
+      "loss": 0.7331,
+      "step": 721
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.35052029199312573,
+      "learning_rate": 0.00016688096287453046,
+      "loss": 0.6652,
+      "step": 722
+    },
+    {
+      "epoch": 0.2892,
+      "grad_norm": 0.36484181678208727,
+      "learning_rate": 0.00016678459478239118,
+      "loss": 0.6677,
+      "step": 723
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3790540606763471,
+      "learning_rate": 0.00016668811460396202,
+      "loss": 0.6642,
+      "step": 724
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.36849657826638405,
+      "learning_rate": 0.00016659152250116812,
+      "loss": 0.6618,
+      "step": 725
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.3892714920294252,
+      "learning_rate": 0.00016649481863612248,
+      "loss": 0.6795,
+      "step": 726
+    },
+    {
+      "epoch": 0.2908,
+      "grad_norm": 0.35662611125712923,
+      "learning_rate": 0.0001663980031711257,
+      "loss": 0.5695,
+      "step": 727
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.40575267772496953,
+      "learning_rate": 0.00016630107626866558,
+      "loss": 0.7215,
+      "step": 728
+    },
+    {
+      "epoch": 0.2916,
+      "grad_norm": 0.372483669398152,
+      "learning_rate": 0.00016620403809141705,
+      "loss": 0.6797,
+      "step": 729
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.36510913163052217,
+      "learning_rate": 0.00016610688880224178,
+      "loss": 0.6857,
+      "step": 730
+    },
+    {
+      "epoch": 0.2924,
+      "grad_norm": 0.38278306058860057,
+      "learning_rate": 0.00016600962856418782,
+      "loss": 0.632,
+      "step": 731
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.3610059628511899,
+      "learning_rate": 0.00016591225754048963,
+      "loss": 0.6671,
+      "step": 732
+    },
+    {
+      "epoch": 0.2932,
+      "grad_norm": 0.35661460366895475,
+      "learning_rate": 0.00016581477589456734,
+      "loss": 0.6547,
+      "step": 733
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.3476447348320809,
+      "learning_rate": 0.00016571718379002705,
+      "loss": 0.6387,
+      "step": 734
+    },
+    {
+      "epoch": 0.294,
+      "grad_norm": 0.3692442697306115,
+      "learning_rate": 0.00016561948139065996,
+      "loss": 0.7041,
+      "step": 735
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.35450747006735767,
+      "learning_rate": 0.00016552166886044253,
+      "loss": 0.7146,
+      "step": 736
+    },
+    {
+      "epoch": 0.2948,
+      "grad_norm": 0.3669567893885407,
+      "learning_rate": 0.00016542374636353604,
+      "loss": 0.6699,
+      "step": 737
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.3594868978946008,
+      "learning_rate": 0.0001653257140642863,
+      "loss": 0.6495,
+      "step": 738
+    },
+    {
+      "epoch": 0.2956,
+      "grad_norm": 0.3634931926296147,
+      "learning_rate": 0.00016522757212722344,
+      "loss": 0.6649,
+      "step": 739
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.350291190620466,
+      "learning_rate": 0.00016512932071706152,
+      "loss": 0.6881,
+      "step": 740
+    },
+    {
+      "epoch": 0.2964,
+      "grad_norm": 0.3610260685199456,
+      "learning_rate": 0.0001650309599986985,
+      "loss": 0.6776,
+      "step": 741
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.3569659676953712,
+      "learning_rate": 0.00016493249013721558,
+      "loss": 0.6505,
+      "step": 742
+    },
+    {
+      "epoch": 0.2972,
+      "grad_norm": 0.36554646148471603,
+      "learning_rate": 0.00016483391129787727,
+      "loss": 0.6614,
+      "step": 743
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.350784920726585,
+      "learning_rate": 0.000164735223646131,
+      "loss": 0.6603,
+      "step": 744
+    },
+    {
+      "epoch": 0.298,
+      "grad_norm": 0.347788847751402,
+      "learning_rate": 0.0001646364273476067,
+      "loss": 0.6799,
+      "step": 745
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.3626403008330356,
+      "learning_rate": 0.00016453752256811674,
+      "loss": 0.6834,
+      "step": 746
+    },
+    {
+      "epoch": 0.2988,
+      "grad_norm": 0.3657655470550385,
+      "learning_rate": 0.00016443850947365558,
+      "loss": 0.6584,
+      "step": 747
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.34036129141990146,
+      "learning_rate": 0.0001643393882303994,
+      "loss": 0.6319,
+      "step": 748
+    },
+    {
+      "epoch": 0.2996,
+      "grad_norm": 0.35126675290236997,
+      "learning_rate": 0.00016424015900470587,
+      "loss": 0.6651,
+      "step": 749
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.35391930145941913,
+      "learning_rate": 0.000164140821963114,
+      "loss": 0.6863,
+      "step": 750
+    },
+    {
+      "epoch": 0.3004,
+      "grad_norm": 0.370861156187132,
+      "learning_rate": 0.00016404137727234365,
+      "loss": 0.6459,
+      "step": 751
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.35987999591978354,
+      "learning_rate": 0.00016394182509929536,
+      "loss": 0.672,
+      "step": 752
+    },
+    {
+      "epoch": 0.3012,
+      "grad_norm": 0.3587536922915207,
+      "learning_rate": 0.00016384216561105014,
+      "loss": 0.6978,
+      "step": 753
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.3511980224029837,
+      "learning_rate": 0.000163742398974869,
+      "loss": 0.6422,
+      "step": 754
+    },
+    {
+      "epoch": 0.302,
+      "grad_norm": 0.34901958545175216,
+      "learning_rate": 0.00016364252535819282,
+      "loss": 0.6684,
+      "step": 755
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.33961478583723576,
+      "learning_rate": 0.00016354254492864211,
+      "loss": 0.6703,
+      "step": 756
+    },
+    {
+      "epoch": 0.3028,
+      "grad_norm": 0.34445952568829874,
+      "learning_rate": 0.00016344245785401653,
+      "loss": 0.6511,
+      "step": 757
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.35753721007263206,
+      "learning_rate": 0.00016334226430229475,
+      "loss": 0.7074,
+      "step": 758
+    },
+    {
+      "epoch": 0.3036,
+      "grad_norm": 0.37499914558856595,
+      "learning_rate": 0.00016324196444163423,
+      "loss": 0.7285,
+      "step": 759
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3630558022939141,
+      "learning_rate": 0.00016314155844037074,
+      "loss": 0.6575,
+      "step": 760
+    },
+    {
+      "epoch": 0.3044,
+      "grad_norm": 0.35329110139187386,
+      "learning_rate": 0.0001630410464670182,
+      "loss": 0.6434,
+      "step": 761
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.3639095040912134,
+      "learning_rate": 0.00016294042869026851,
+      "loss": 0.648,
+      "step": 762
+    },
+    {
+      "epoch": 0.3052,
+      "grad_norm": 0.3843760678407015,
+      "learning_rate": 0.000162839705278991,
+      "loss": 0.6508,
+      "step": 763
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.3872124608722155,
+      "learning_rate": 0.0001627388764022323,
+      "loss": 0.6649,
+      "step": 764
+    },
+    {
+      "epoch": 0.306,
+      "grad_norm": 0.3768132943244325,
+      "learning_rate": 0.0001626379422292162,
+      "loss": 0.6286,
+      "step": 765
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.36204946392202214,
+      "learning_rate": 0.000162536902929343,
+      "loss": 0.6278,
+      "step": 766
+    },
+    {
+      "epoch": 0.3068,
+      "grad_norm": 0.36082703461295695,
+      "learning_rate": 0.00016243575867218958,
+      "loss": 0.6616,
+      "step": 767
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3484587533493218,
+      "learning_rate": 0.00016233450962750893,
+      "loss": 0.6315,
+      "step": 768
+    },
+    {
+      "epoch": 0.3076,
+      "grad_norm": 0.34636803888691087,
+      "learning_rate": 0.00016223315596522987,
+      "loss": 0.6898,
+      "step": 769
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.38751438455311005,
+      "learning_rate": 0.0001621316978554569,
+      "loss": 0.7102,
+      "step": 770
+    },
+    {
+      "epoch": 0.3084,
+      "grad_norm": 0.36662104390459777,
+      "learning_rate": 0.00016203013546846966,
+      "loss": 0.7066,
+      "step": 771
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.35577550680251185,
+      "learning_rate": 0.00016192846897472297,
+      "loss": 0.6634,
+      "step": 772
+    },
+    {
+      "epoch": 0.3092,
+      "grad_norm": 0.36344365908813225,
+      "learning_rate": 0.0001618266985448463,
+      "loss": 0.6888,
+      "step": 773
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.35338050166466156,
+      "learning_rate": 0.00016172482434964353,
+      "loss": 0.6299,
+      "step": 774
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.36305744063960965,
+      "learning_rate": 0.00016162284656009274,
+      "loss": 0.6935,
+      "step": 775
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.37054085691252775,
+      "learning_rate": 0.00016152076534734584,
+      "loss": 0.6571,
+      "step": 776
+    },
+    {
+      "epoch": 0.3108,
+      "grad_norm": 0.3950689393757759,
+      "learning_rate": 0.00016141858088272837,
+      "loss": 0.7024,
+      "step": 777
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.35527100014035695,
+      "learning_rate": 0.00016131629333773908,
+      "loss": 0.6599,
+      "step": 778
+    },
+    {
+      "epoch": 0.3116,
+      "grad_norm": 0.37204052038957597,
+      "learning_rate": 0.0001612139028840498,
+      "loss": 0.7039,
+      "step": 779
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.35028572528380963,
+      "learning_rate": 0.00016111140969350503,
+      "loss": 0.6679,
+      "step": 780
+    },
+    {
+      "epoch": 0.3124,
+      "grad_norm": 0.3952564467547976,
+      "learning_rate": 0.0001610088139381217,
+      "loss": 0.7119,
+      "step": 781
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.3589953166918137,
+      "learning_rate": 0.00016090611579008888,
+      "loss": 0.6773,
+      "step": 782
+    },
+    {
+      "epoch": 0.3132,
+      "grad_norm": 0.35319758705458487,
+      "learning_rate": 0.00016080331542176753,
+      "loss": 0.6585,
+      "step": 783
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3677613546602713,
+      "learning_rate": 0.00016070041300569012,
+      "loss": 0.6966,
+      "step": 784
+    },
+    {
+      "epoch": 0.314,
+      "grad_norm": 0.38700046926645354,
+      "learning_rate": 0.00016059740871456036,
+      "loss": 0.6679,
+      "step": 785
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.37625609996376963,
+      "learning_rate": 0.000160494302721253,
+      "loss": 0.6678,
+      "step": 786
+    },
+    {
+      "epoch": 0.3148,
+      "grad_norm": 0.37295043227958613,
+      "learning_rate": 0.0001603910951988135,
+      "loss": 0.685,
+      "step": 787
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3560683487097046,
+      "learning_rate": 0.00016028778632045762,
+      "loss": 0.6134,
+      "step": 788
+    },
+    {
+      "epoch": 0.3156,
+      "grad_norm": 0.3601034178766075,
+      "learning_rate": 0.00016018437625957133,
+      "loss": 0.6901,
+      "step": 789
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.3705501734874379,
+      "learning_rate": 0.00016008086518971037,
+      "loss": 0.7209,
+      "step": 790
+    },
+    {
+      "epoch": 0.3164,
+      "grad_norm": 0.3914966449738789,
+      "learning_rate": 0.0001599772532846,
+      "loss": 0.6661,
+      "step": 791
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.38500568784698896,
+      "learning_rate": 0.0001598735407181347,
+      "loss": 0.6657,
+      "step": 792
+    },
+    {
+      "epoch": 0.3172,
+      "grad_norm": 0.3667191488097956,
+      "learning_rate": 0.00015976972766437795,
+      "loss": 0.6769,
+      "step": 793
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.3740913595595519,
+      "learning_rate": 0.00015966581429756183,
+      "loss": 0.6601,
+      "step": 794
+    },
+    {
+      "epoch": 0.318,
+      "grad_norm": 0.354469593288666,
+      "learning_rate": 0.00015956180079208682,
+      "loss": 0.6903,
+      "step": 795
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.35637752265785166,
+      "learning_rate": 0.00015945768732252144,
+      "loss": 0.6529,
+      "step": 796
+    },
+    {
+      "epoch": 0.3188,
+      "grad_norm": 0.34671445467104656,
+      "learning_rate": 0.00015935347406360192,
+      "loss": 0.6213,
+      "step": 797
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.41260532402747196,
+      "learning_rate": 0.00015924916119023212,
+      "loss": 0.7666,
+      "step": 798
+    },
+    {
+      "epoch": 0.3196,
+      "grad_norm": 0.37980397342432365,
+      "learning_rate": 0.00015914474887748295,
+      "loss": 0.688,
+      "step": 799
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3471779827666835,
+      "learning_rate": 0.00015904023730059228,
+      "loss": 0.6414,
+      "step": 800
+    },
+    {
+      "epoch": 0.3204,
+      "grad_norm": 0.39442178437895997,
+      "learning_rate": 0.0001589356266349645,
+      "loss": 0.702,
+      "step": 801
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.36392050739465226,
+      "learning_rate": 0.00015883091705617045,
+      "loss": 0.685,
+      "step": 802
+    },
+    {
+      "epoch": 0.3212,
+      "grad_norm": 0.3730467393953074,
+      "learning_rate": 0.00015872610873994685,
+      "loss": 0.6942,
+      "step": 803
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.351615133925163,
+      "learning_rate": 0.00015862120186219613,
+      "loss": 0.6402,
+      "step": 804
+    },
+    {
+      "epoch": 0.322,
+      "grad_norm": 0.3595595588948544,
+      "learning_rate": 0.00015851619659898623,
+      "loss": 0.6736,
+      "step": 805
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.41516559446603896,
+      "learning_rate": 0.00015841109312655016,
+      "loss": 0.6576,
+      "step": 806
+    },
+    {
+      "epoch": 0.3228,
+      "grad_norm": 0.35512516476410827,
+      "learning_rate": 0.00015830589162128572,
+      "loss": 0.6371,
+      "step": 807
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.36836803104245197,
+      "learning_rate": 0.00015820059225975531,
+      "loss": 0.6896,
+      "step": 808
+    },
+    {
+      "epoch": 0.3236,
+      "grad_norm": 0.3968393493789091,
+      "learning_rate": 0.0001580951952186856,
+      "loss": 0.7132,
+      "step": 809
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.36669128320751304,
+      "learning_rate": 0.000157989700674967,
+      "loss": 0.6834,
+      "step": 810
+    },
+    {
+      "epoch": 0.3244,
+      "grad_norm": 0.37771858320485735,
+      "learning_rate": 0.00015788410880565379,
+      "loss": 0.7076,
+      "step": 811
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.3653533497184648,
+      "learning_rate": 0.00015777841978796347,
+      "loss": 0.686,
+      "step": 812
+    },
+    {
+      "epoch": 0.3252,
+      "grad_norm": 0.3719804998835032,
+      "learning_rate": 0.0001576726337992766,
+      "loss": 0.7133,
+      "step": 813
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.34430137750629114,
+      "learning_rate": 0.00015756675101713657,
+      "loss": 0.6281,
+      "step": 814
+    },
+    {
+      "epoch": 0.326,
+      "grad_norm": 0.3874952604990868,
+      "learning_rate": 0.00015746077161924905,
+      "loss": 0.6883,
+      "step": 815
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3602363795621833,
+      "learning_rate": 0.00015735469578348208,
+      "loss": 0.658,
+      "step": 816
+    },
+    {
+      "epoch": 0.3268,
+      "grad_norm": 0.3678885252566548,
+      "learning_rate": 0.00015724852368786537,
+      "loss": 0.6451,
+      "step": 817
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.37231861031745034,
+      "learning_rate": 0.0001571422555105903,
+      "loss": 0.6797,
+      "step": 818
+    },
+    {
+      "epoch": 0.3276,
+      "grad_norm": 0.36582627986791816,
+      "learning_rate": 0.0001570358914300094,
+      "loss": 0.6686,
+      "step": 819
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3751003188638191,
+      "learning_rate": 0.00015692943162463628,
+      "loss": 0.6865,
+      "step": 820
+    },
+    {
+      "epoch": 0.3284,
+      "grad_norm": 0.35279218250418054,
+      "learning_rate": 0.00015682287627314515,
+      "loss": 0.6436,
+      "step": 821
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.39811222024035187,
+      "learning_rate": 0.00015671622555437053,
+      "loss": 0.6913,
+      "step": 822
+    },
+    {
+      "epoch": 0.3292,
+      "grad_norm": 0.35173620983932047,
+      "learning_rate": 0.00015660947964730708,
+      "loss": 0.6351,
+      "step": 823
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.35186063275235907,
+      "learning_rate": 0.0001565026387311092,
+      "loss": 0.6351,
+      "step": 824
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.3582217383025288,
+      "learning_rate": 0.00015639570298509064,
+      "loss": 0.6668,
+      "step": 825
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.3528654454610766,
+      "learning_rate": 0.0001562886725887245,
+      "loss": 0.6337,
+      "step": 826
+    },
+    {
+      "epoch": 0.3308,
+      "grad_norm": 0.3779509939019467,
+      "learning_rate": 0.00015618154772164256,
+      "loss": 0.7223,
+      "step": 827
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.36399592374656126,
+      "learning_rate": 0.00015607432856363525,
+      "loss": 0.6322,
+      "step": 828
+    },
+    {
+      "epoch": 0.3316,
+      "grad_norm": 0.3555705327070775,
+      "learning_rate": 0.00015596701529465117,
+      "loss": 0.609,
+      "step": 829
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.37182042716754116,
+      "learning_rate": 0.00015585960809479696,
+      "loss": 0.6626,
+      "step": 830
+    },
+    {
+      "epoch": 0.3324,
+      "grad_norm": 0.3668089208734812,
+      "learning_rate": 0.00015575210714433686,
+      "loss": 0.6255,
+      "step": 831
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3637504693784944,
+      "learning_rate": 0.00015564451262369247,
+      "loss": 0.6611,
+      "step": 832
+    },
+    {
+      "epoch": 0.3332,
+      "grad_norm": 0.36434482436667015,
+      "learning_rate": 0.00015553682471344238,
+      "loss": 0.7383,
+      "step": 833
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.39528366147657606,
+      "learning_rate": 0.00015542904359432198,
+      "loss": 0.6729,
+      "step": 834
+    },
+    {
+      "epoch": 0.334,
+      "grad_norm": 0.3606252529006789,
+      "learning_rate": 0.00015532116944722308,
+      "loss": 0.6206,
+      "step": 835
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3431618001058578,
+      "learning_rate": 0.00015521320245319363,
+      "loss": 0.6346,
+      "step": 836
+    },
+    {
+      "epoch": 0.3348,
+      "grad_norm": 0.35978080940778,
+      "learning_rate": 0.00015510514279343734,
+      "loss": 0.6922,
+      "step": 837
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3511258242838397,
+      "learning_rate": 0.00015499699064931355,
+      "loss": 0.6275,
+      "step": 838
+    },
+    {
+      "epoch": 0.3356,
+      "grad_norm": 0.35577057580095667,
+      "learning_rate": 0.00015488874620233674,
+      "loss": 0.6601,
+      "step": 839
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4011721496975597,
+      "learning_rate": 0.0001547804096341763,
+      "loss": 0.6742,
+      "step": 840
+    },
+    {
+      "epoch": 0.3364,
+      "grad_norm": 0.3781361898780127,
+      "learning_rate": 0.00015467198112665632,
+      "loss": 0.6873,
+      "step": 841
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.3585323822428953,
+      "learning_rate": 0.0001545634608617551,
+      "loss": 0.6562,
+      "step": 842
+    },
+    {
+      "epoch": 0.3372,
+      "grad_norm": 0.36587737160692985,
+      "learning_rate": 0.00015445484902160491,
+      "loss": 0.6777,
+      "step": 843
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.3789378464952347,
+      "learning_rate": 0.00015434614578849188,
+      "loss": 0.6335,
+      "step": 844
+    },
+    {
+      "epoch": 0.338,
+      "grad_norm": 0.3605160743547415,
+      "learning_rate": 0.00015423735134485536,
+      "loss": 0.6728,
+      "step": 845
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.3636761377770127,
+      "learning_rate": 0.00015412846587328782,
+      "loss": 0.6506,
+      "step": 846
+    },
+    {
+      "epoch": 0.3388,
+      "grad_norm": 0.3488872370925301,
+      "learning_rate": 0.0001540194895565346,
+      "loss": 0.65,
+      "step": 847
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.375624221182326,
+      "learning_rate": 0.00015391042257749336,
+      "loss": 0.671,
+      "step": 848
+    },
+    {
+      "epoch": 0.3396,
+      "grad_norm": 0.35330262247588673,
+      "learning_rate": 0.00015380126511921403,
+      "loss": 0.6714,
+      "step": 849
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.35426489657914634,
+      "learning_rate": 0.0001536920173648984,
+      "loss": 0.669,
+      "step": 850
+    },
+    {
+      "epoch": 0.3404,
+      "grad_norm": 0.3723971472909655,
+      "learning_rate": 0.00015358267949789966,
+      "loss": 0.6926,
+      "step": 851
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3925138623019826,
+      "learning_rate": 0.00015347325170172245,
+      "loss": 0.6884,
+      "step": 852
+    },
+    {
+      "epoch": 0.3412,
+      "grad_norm": 0.3737024778126771,
+      "learning_rate": 0.0001533637341600221,
+      "loss": 0.6918,
+      "step": 853
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.36923274420514496,
+      "learning_rate": 0.0001532541270566049,
+      "loss": 0.6423,
+      "step": 854
+    },
+    {
+      "epoch": 0.342,
+      "grad_norm": 0.3650733794413604,
+      "learning_rate": 0.00015314443057542703,
+      "loss": 0.6589,
+      "step": 855
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.39054808959550796,
+      "learning_rate": 0.00015303464490059506,
+      "loss": 0.6576,
+      "step": 856
+    },
+    {
+      "epoch": 0.3428,
+      "grad_norm": 0.3586992537018854,
+      "learning_rate": 0.00015292477021636497,
+      "loss": 0.6529,
+      "step": 857
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.37092971161160443,
+      "learning_rate": 0.0001528148067071423,
+      "loss": 0.6625,
+      "step": 858
+    },
+    {
+      "epoch": 0.3436,
+      "grad_norm": 0.33410515280317205,
+      "learning_rate": 0.00015270475455748166,
+      "loss": 0.632,
+      "step": 859
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3640275136273407,
+      "learning_rate": 0.00015259461395208628,
+      "loss": 0.662,
+      "step": 860
+    },
+    {
+      "epoch": 0.3444,
+      "grad_norm": 0.3613335383897598,
+      "learning_rate": 0.00015248438507580806,
+      "loss": 0.6969,
+      "step": 861
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.35592397239382606,
+      "learning_rate": 0.00015237406811364682,
+      "loss": 0.6969,
+      "step": 862
+    },
+    {
+      "epoch": 0.3452,
+      "grad_norm": 0.39705521362789375,
+      "learning_rate": 0.0001522636632507504,
+      "loss": 0.6707,
+      "step": 863
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.36112856011278505,
+      "learning_rate": 0.00015215317067241414,
+      "loss": 0.6631,
+      "step": 864
+    },
+    {
+      "epoch": 0.346,
+      "grad_norm": 0.38273950968861264,
+      "learning_rate": 0.00015204259056408046,
+      "loss": 0.667,
+      "step": 865
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.3783694059874637,
+      "learning_rate": 0.00015193192311133884,
+      "loss": 0.6663,
+      "step": 866
+    },
+    {
+      "epoch": 0.3468,
+      "grad_norm": 0.366434677879186,
+      "learning_rate": 0.00015182116849992526,
+      "loss": 0.6166,
+      "step": 867
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.3458935991132749,
+      "learning_rate": 0.00015171032691572206,
+      "loss": 0.6834,
+      "step": 868
+    },
+    {
+      "epoch": 0.3476,
+      "grad_norm": 0.36496087348892964,
+      "learning_rate": 0.00015159939854475743,
+      "loss": 0.6704,
+      "step": 869
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.3760305334945727,
+      "learning_rate": 0.00015148838357320537,
+      "loss": 0.6935,
+      "step": 870
+    },
+    {
+      "epoch": 0.3484,
+      "grad_norm": 0.35850665616931604,
+      "learning_rate": 0.00015137728218738502,
+      "loss": 0.6872,
+      "step": 871
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3928618937253879,
+      "learning_rate": 0.0001512660945737608,
+      "loss": 0.6643,
+      "step": 872
+    },
+    {
+      "epoch": 0.3492,
+      "grad_norm": 0.35046941936035353,
+      "learning_rate": 0.00015115482091894165,
+      "loss": 0.6654,
+      "step": 873
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.3685208949359589,
+      "learning_rate": 0.00015104346140968095,
+      "loss": 0.6759,
+      "step": 874
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.34450173958566066,
+      "learning_rate": 0.00015093201623287631,
+      "loss": 0.6523,
+      "step": 875
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.36555178856164366,
+      "learning_rate": 0.00015082048557556893,
+      "loss": 0.625,
+      "step": 876
+    },
+    {
+      "epoch": 0.3508,
+      "grad_norm": 0.37279078820571543,
+      "learning_rate": 0.00015070886962494358,
+      "loss": 0.6755,
+      "step": 877
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.3467247782349474,
+      "learning_rate": 0.0001505971685683282,
+      "loss": 0.636,
+      "step": 878
+    },
+    {
+      "epoch": 0.3516,
+      "grad_norm": 0.35286283300836774,
+      "learning_rate": 0.00015048538259319346,
+      "loss": 0.6466,
+      "step": 879
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3396382068456816,
+      "learning_rate": 0.00015037351188715265,
+      "loss": 0.6633,
+      "step": 880
+    },
+    {
+      "epoch": 0.3524,
+      "grad_norm": 0.35316683921346637,
+      "learning_rate": 0.00015026155663796123,
+      "loss": 0.6227,
+      "step": 881
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.3846584919822831,
+      "learning_rate": 0.00015014951703351653,
+      "loss": 0.6311,
+      "step": 882
+    },
+    {
+      "epoch": 0.3532,
+      "grad_norm": 0.3569064084982436,
+      "learning_rate": 0.00015003739326185751,
+      "loss": 0.6781,
+      "step": 883
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3649128002305385,
+      "learning_rate": 0.00014992518551116434,
+      "loss": 0.6731,
+      "step": 884
+    },
+    {
+      "epoch": 0.354,
+      "grad_norm": 0.3723253026564224,
+      "learning_rate": 0.00014981289396975817,
+      "loss": 0.6747,
+      "step": 885
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.363202738117168,
+      "learning_rate": 0.0001497005188261007,
+      "loss": 0.6734,
+      "step": 886
+    },
+    {
+      "epoch": 0.3548,
+      "grad_norm": 0.3548792469765704,
+      "learning_rate": 0.0001495880602687941,
+      "loss": 0.6462,
+      "step": 887
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3474115630121927,
+      "learning_rate": 0.00014947551848658034,
+      "loss": 0.6577,
+      "step": 888
+    },
+    {
+      "epoch": 0.3556,
+      "grad_norm": 0.374218002987988,
+      "learning_rate": 0.00014936289366834123,
+      "loss": 0.6991,
+      "step": 889
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.35089097506675154,
+      "learning_rate": 0.00014925018600309785,
+      "loss": 0.6623,
+      "step": 890
+    },
+    {
+      "epoch": 0.3564,
+      "grad_norm": 0.39265641433945575,
+      "learning_rate": 0.00014913739568001033,
+      "loss": 0.6654,
+      "step": 891
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3540771581716744,
+      "learning_rate": 0.0001490245228883776,
+      "loss": 0.6691,
+      "step": 892
+    },
+    {
+      "epoch": 0.3572,
+      "grad_norm": 0.35062203052174024,
+      "learning_rate": 0.0001489115678176369,
+      "loss": 0.617,
+      "step": 893
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.3867056359915103,
+      "learning_rate": 0.00014879853065736365,
+      "loss": 0.6904,
+      "step": 894
+    },
+    {
+      "epoch": 0.358,
+      "grad_norm": 0.3645808187567261,
+      "learning_rate": 0.00014868541159727096,
+      "loss": 0.6453,
+      "step": 895
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3624902031755957,
+      "learning_rate": 0.00014857221082720948,
+      "loss": 0.6612,
+      "step": 896
+    },
+    {
+      "epoch": 0.3588,
+      "grad_norm": 0.3542078534637287,
+      "learning_rate": 0.0001484589285371669,
+      "loss": 0.6572,
+      "step": 897
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.3629270925759756,
+      "learning_rate": 0.0001483455649172678,
+      "loss": 0.6454,
+      "step": 898
+    },
+    {
+      "epoch": 0.3596,
+      "grad_norm": 0.36443854767359396,
+      "learning_rate": 0.0001482321201577733,
+      "loss": 0.6501,
+      "step": 899
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.35841159262345307,
+      "learning_rate": 0.00014811859444908052,
+      "loss": 0.674,
+      "step": 900
+    },
+    {
+      "epoch": 0.3604,
+      "grad_norm": 0.3478107592885514,
+      "learning_rate": 0.0001480049879817226,
+      "loss": 0.66,
+      "step": 901
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.34446415507051853,
+      "learning_rate": 0.0001478913009463682,
+      "loss": 0.6475,
+      "step": 902
+    },
+    {
+      "epoch": 0.3612,
+      "grad_norm": 0.3357149083896519,
+      "learning_rate": 0.00014777753353382119,
+      "loss": 0.6466,
+      "step": 903
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3701287329337555,
+      "learning_rate": 0.00014766368593502026,
+      "loss": 0.6728,
+      "step": 904
+    },
+    {
+      "epoch": 0.362,
+      "grad_norm": 0.3652716631266611,
+      "learning_rate": 0.00014754975834103877,
+      "loss": 0.6557,
+      "step": 905
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.37214552315597893,
+      "learning_rate": 0.00014743575094308431,
+      "loss": 0.6652,
+      "step": 906
+    },
+    {
+      "epoch": 0.3628,
+      "grad_norm": 0.3555294949666416,
+      "learning_rate": 0.0001473216639324984,
+      "loss": 0.6431,
+      "step": 907
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.3521451446522717,
+      "learning_rate": 0.0001472074975007562,
+      "loss": 0.6534,
+      "step": 908
+    },
+    {
+      "epoch": 0.3636,
+      "grad_norm": 0.3476086077345556,
+      "learning_rate": 0.0001470932518394661,
+      "loss": 0.637,
+      "step": 909
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.37783200576792464,
+      "learning_rate": 0.00014697892714036958,
+      "loss": 0.6883,
+      "step": 910
+    },
+    {
+      "epoch": 0.3644,
+      "grad_norm": 0.3494756499260396,
+      "learning_rate": 0.00014686452359534066,
+      "loss": 0.6536,
+      "step": 911
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3491803081093194,
+      "learning_rate": 0.0001467500413963857,
+      "loss": 0.6926,
+      "step": 912
+    },
+    {
+      "epoch": 0.3652,
+      "grad_norm": 0.3747430627750039,
+      "learning_rate": 0.00014663548073564316,
+      "loss": 0.6423,
+      "step": 913
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.42084751004103793,
+      "learning_rate": 0.00014652084180538302,
+      "loss": 0.6585,
+      "step": 914
+    },
+    {
+      "epoch": 0.366,
+      "grad_norm": 0.3557562025796701,
+      "learning_rate": 0.00014640612479800686,
+      "loss": 0.6542,
+      "step": 915
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.38388321779715584,
+      "learning_rate": 0.00014629132990604706,
+      "loss": 0.6331,
+      "step": 916
+    },
+    {
+      "epoch": 0.3668,
+      "grad_norm": 0.3848509563350158,
+      "learning_rate": 0.00014617645732216685,
+      "loss": 0.6476,
+      "step": 917
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.36490760655892146,
+      "learning_rate": 0.00014606150723915984,
+      "loss": 0.6403,
+      "step": 918
+    },
+    {
+      "epoch": 0.3676,
+      "grad_norm": 0.3947551795151395,
+      "learning_rate": 0.00014594647984994964,
+      "loss": 0.6904,
+      "step": 919
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.35124246715619073,
+      "learning_rate": 0.00014583137534758967,
+      "loss": 0.6447,
+      "step": 920
+    },
+    {
+      "epoch": 0.3684,
+      "grad_norm": 0.36635027190183794,
+      "learning_rate": 0.00014571619392526278,
+      "loss": 0.6458,
+      "step": 921
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.3581306372172641,
+      "learning_rate": 0.0001456009357762809,
+      "loss": 0.6528,
+      "step": 922
+    },
+    {
+      "epoch": 0.3692,
+      "grad_norm": 0.3615759086065034,
+      "learning_rate": 0.00014548560109408466,
+      "loss": 0.7031,
+      "step": 923
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3807900684626195,
+      "learning_rate": 0.00014537019007224324,
+      "loss": 0.72,
+      "step": 924
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.38231568945229655,
+      "learning_rate": 0.00014525470290445392,
+      "loss": 0.6839,
+      "step": 925
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.34273098488248055,
+      "learning_rate": 0.00014513913978454168,
+      "loss": 0.6582,
+      "step": 926
+    },
+    {
+      "epoch": 0.3708,
+      "grad_norm": 0.39393100124395825,
+      "learning_rate": 0.00014502350090645917,
+      "loss": 0.6721,
+      "step": 927
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3400567054502143,
+      "learning_rate": 0.000144907786464286,
+      "loss": 0.6406,
+      "step": 928
+    },
+    {
+      "epoch": 0.3716,
+      "grad_norm": 0.3670211913921609,
+      "learning_rate": 0.0001447919966522287,
+      "loss": 0.6818,
+      "step": 929
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.35772860406848145,
+      "learning_rate": 0.00014467613166462023,
+      "loss": 0.6296,
+      "step": 930
+    },
+    {
+      "epoch": 0.3724,
+      "grad_norm": 0.3526738213667584,
+      "learning_rate": 0.00014456019169591978,
+      "loss": 0.6407,
+      "step": 931
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.34854679382294573,
+      "learning_rate": 0.0001444441769407124,
+      "loss": 0.648,
+      "step": 932
+    },
+    {
+      "epoch": 0.3732,
+      "grad_norm": 0.3621509291016752,
+      "learning_rate": 0.00014432808759370854,
+      "loss": 0.7082,
+      "step": 933
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.35524954321215996,
+      "learning_rate": 0.00014421192384974396,
+      "loss": 0.6422,
+      "step": 934
+    },
+    {
+      "epoch": 0.374,
+      "grad_norm": 0.3558940796138577,
+      "learning_rate": 0.00014409568590377918,
+      "loss": 0.6626,
+      "step": 935
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.33592802217161094,
+      "learning_rate": 0.0001439793739508994,
+      "loss": 0.6347,
+      "step": 936
+    },
+    {
+      "epoch": 0.3748,
+      "grad_norm": 0.3820145016362385,
+      "learning_rate": 0.00014386298818631386,
+      "loss": 0.6434,
+      "step": 937
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.3927205004390802,
+      "learning_rate": 0.0001437465288053558,
+      "loss": 0.718,
+      "step": 938
+    },
+    {
+      "epoch": 0.3756,
+      "grad_norm": 0.3518561450001955,
+      "learning_rate": 0.00014362999600348196,
+      "loss": 0.6321,
+      "step": 939
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3627903941056215,
+      "learning_rate": 0.00014351338997627234,
+      "loss": 0.6589,
+      "step": 940
+    },
+    {
+      "epoch": 0.3764,
+      "grad_norm": 0.3419710630379931,
+      "learning_rate": 0.00014339671091942978,
+      "loss": 0.6235,
+      "step": 941
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.36879674846754606,
+      "learning_rate": 0.0001432799590287797,
+      "loss": 0.6674,
+      "step": 942
+    },
+    {
+      "epoch": 0.3772,
+      "grad_norm": 0.35366350852981354,
+      "learning_rate": 0.00014316313450026986,
+      "loss": 0.6382,
+      "step": 943
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.37921850015556424,
+      "learning_rate": 0.00014304623752996973,
+      "loss": 0.6263,
+      "step": 944
+    },
+    {
+      "epoch": 0.378,
+      "grad_norm": 0.3742507615630018,
+      "learning_rate": 0.00014292926831407061,
+      "loss": 0.6481,
+      "step": 945
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.37185581882115953,
+      "learning_rate": 0.0001428122270488848,
+      "loss": 0.6749,
+      "step": 946
+    },
+    {
+      "epoch": 0.3788,
+      "grad_norm": 0.36292397414328936,
+      "learning_rate": 0.00014269511393084572,
+      "loss": 0.6045,
+      "step": 947
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.36157941938220134,
+      "learning_rate": 0.00014257792915650728,
+      "loss": 0.615,
+      "step": 948
+    },
+    {
+      "epoch": 0.3796,
+      "grad_norm": 0.38269180434028977,
+      "learning_rate": 0.00014246067292254366,
+      "loss": 0.681,
+      "step": 949
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.3827510153886556,
+      "learning_rate": 0.00014234334542574906,
+      "loss": 0.6877,
+      "step": 950
+    },
+    {
+      "epoch": 0.3804,
+      "grad_norm": 0.37997491138841605,
+      "learning_rate": 0.00014222594686303706,
+      "loss": 0.6867,
+      "step": 951
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.36575592804829177,
+      "learning_rate": 0.00014210847743144087,
+      "loss": 0.6839,
+      "step": 952
+    },
+    {
+      "epoch": 0.3812,
+      "grad_norm": 0.3882126265654071,
+      "learning_rate": 0.00014199093732811225,
+      "loss": 0.616,
+      "step": 953
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.3543677696755192,
+      "learning_rate": 0.00014187332675032188,
+      "loss": 0.5746,
+      "step": 954
+    },
+    {
+      "epoch": 0.382,
+      "grad_norm": 0.3460644642182079,
+      "learning_rate": 0.00014175564589545854,
+      "loss": 0.676,
+      "step": 955
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.35503051555346704,
+      "learning_rate": 0.00014163789496102902,
+      "loss": 0.6153,
+      "step": 956
+    },
+    {
+      "epoch": 0.3828,
+      "grad_norm": 0.34896213065649373,
+      "learning_rate": 0.0001415200741446577,
+      "loss": 0.6904,
+      "step": 957
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.346168689259183,
+      "learning_rate": 0.00014140218364408632,
+      "loss": 0.6332,
+      "step": 958
+    },
+    {
+      "epoch": 0.3836,
+      "grad_norm": 0.3524594723123227,
+      "learning_rate": 0.00014128422365717347,
+      "loss": 0.6225,
+      "step": 959
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4070227595189606,
+      "learning_rate": 0.0001411661943818944,
+      "loss": 0.6878,
+      "step": 960
+    },
+    {
+      "epoch": 0.3844,
+      "grad_norm": 0.35668228011669395,
+      "learning_rate": 0.0001410480960163407,
+      "loss": 0.704,
+      "step": 961
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.35134169124857884,
+      "learning_rate": 0.00014092992875871979,
+      "loss": 0.6662,
+      "step": 962
+    },
+    {
+      "epoch": 0.3852,
+      "grad_norm": 0.35423691459425,
+      "learning_rate": 0.00014081169280735488,
+      "loss": 0.6708,
+      "step": 963
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.35015610010335474,
+      "learning_rate": 0.00014069338836068433,
+      "loss": 0.609,
+      "step": 964
+    },
+    {
+      "epoch": 0.386,
+      "grad_norm": 0.3499702691181545,
+      "learning_rate": 0.00014057501561726157,
+      "loss": 0.6391,
+      "step": 965
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.32312770115386513,
+      "learning_rate": 0.00014045657477575448,
+      "loss": 0.5978,
+      "step": 966
+    },
+    {
+      "epoch": 0.3868,
+      "grad_norm": 0.3748104243838437,
+      "learning_rate": 0.0001403380660349455,
+      "loss": 0.6848,
+      "step": 967
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3548359551898651,
+      "learning_rate": 0.00014021948959373076,
+      "loss": 0.6492,
+      "step": 968
+    },
+    {
+      "epoch": 0.3876,
+      "grad_norm": 0.35902687894741403,
+      "learning_rate": 0.0001401008456511202,
+      "loss": 0.681,
+      "step": 969
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.34628114641184954,
+      "learning_rate": 0.0001399821344062369,
+      "loss": 0.6307,
+      "step": 970
+    },
+    {
+      "epoch": 0.3884,
+      "grad_norm": 0.3444827567722983,
+      "learning_rate": 0.00013986335605831705,
+      "loss": 0.6209,
+      "step": 971
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.36743538427244765,
+      "learning_rate": 0.00013974451080670934,
+      "loss": 0.6141,
+      "step": 972
+    },
+    {
+      "epoch": 0.3892,
+      "grad_norm": 0.3935183203217268,
+      "learning_rate": 0.0001396255988508748,
+      "loss": 0.6755,
+      "step": 973
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.37448677909376504,
+      "learning_rate": 0.00013950662039038643,
+      "loss": 0.6585,
+      "step": 974
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.35524468770800793,
+      "learning_rate": 0.00013938757562492873,
+      "loss": 0.6477,
+      "step": 975
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3527436400724316,
+      "learning_rate": 0.00013926846475429766,
+      "loss": 0.6934,
+      "step": 976
+    },
+    {
+      "epoch": 0.3908,
+      "grad_norm": 0.3918038008179953,
+      "learning_rate": 0.00013914928797839995,
+      "loss": 0.723,
+      "step": 977
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.34877061301820206,
+      "learning_rate": 0.0001390300454972531,
+      "loss": 0.6628,
+      "step": 978
+    },
+    {
+      "epoch": 0.3916,
+      "grad_norm": 0.3938563882435424,
+      "learning_rate": 0.0001389107375109848,
+      "loss": 0.6691,
+      "step": 979
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.3526362195360042,
+      "learning_rate": 0.00013879136421983266,
+      "loss": 0.6761,
+      "step": 980
+    },
+    {
+      "epoch": 0.3924,
+      "grad_norm": 0.36384177194514367,
+      "learning_rate": 0.00013867192582414393,
+      "loss": 0.6573,
+      "step": 981
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.3483204591210129,
+      "learning_rate": 0.0001385524225243751,
+      "loss": 0.6345,
+      "step": 982
+    },
+    {
+      "epoch": 0.3932,
+      "grad_norm": 0.38026899558829796,
+      "learning_rate": 0.00013843285452109166,
+      "loss": 0.6617,
+      "step": 983
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3779346317714003,
+      "learning_rate": 0.00013831322201496757,
+      "loss": 0.6454,
+      "step": 984
+    },
+    {
+      "epoch": 0.394,
+      "grad_norm": 0.4100717507480477,
+      "learning_rate": 0.0001381935252067852,
+      "loss": 0.6689,
+      "step": 985
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.3571062943128852,
+      "learning_rate": 0.00013807376429743467,
+      "loss": 0.6299,
+      "step": 986
+    },
+    {
+      "epoch": 0.3948,
+      "grad_norm": 0.35142189144611696,
+      "learning_rate": 0.00013795393948791383,
+      "loss": 0.6647,
+      "step": 987
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.3633902967535515,
+      "learning_rate": 0.0001378340509793277,
+      "loss": 0.6273,
+      "step": 988
+    },
+    {
+      "epoch": 0.3956,
+      "grad_norm": 0.3590667520188268,
+      "learning_rate": 0.00013771409897288822,
+      "loss": 0.6305,
+      "step": 989
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.3605606122453971,
+      "learning_rate": 0.0001375940836699139,
+      "loss": 0.6957,
+      "step": 990
+    },
+    {
+      "epoch": 0.3964,
+      "grad_norm": 0.35831604074772716,
+      "learning_rate": 0.00013747400527182953,
+      "loss": 0.6694,
+      "step": 991
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.36450846842086443,
+      "learning_rate": 0.0001373538639801657,
+      "loss": 0.6336,
+      "step": 992
+    },
+    {
+      "epoch": 0.3972,
+      "grad_norm": 0.3504069378684052,
+      "learning_rate": 0.0001372336599965586,
+      "loss": 0.6714,
+      "step": 993
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.3628339670725981,
+      "learning_rate": 0.00013711339352274966,
+      "loss": 0.6611,
+      "step": 994
+    },
+    {
+      "epoch": 0.398,
+      "grad_norm": 0.35354002460595096,
+      "learning_rate": 0.0001369930647605852,
+      "loss": 0.6947,
+      "step": 995
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3480541378031107,
+      "learning_rate": 0.00013687267391201605,
+      "loss": 0.6158,
+      "step": 996
+    },
+    {
+      "epoch": 0.3988,
+      "grad_norm": 0.3640618287141173,
+      "learning_rate": 0.00013675222117909717,
+      "loss": 0.6381,
+      "step": 997
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.35634838244330663,
+      "learning_rate": 0.00013663170676398752,
+      "loss": 0.6396,
+      "step": 998
+    },
+    {
+      "epoch": 0.3996,
+      "grad_norm": 0.3506478735382033,
+      "learning_rate": 0.00013651113086894952,
+      "loss": 0.6867,
+      "step": 999
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.35888751195771684,
+      "learning_rate": 0.00013639049369634876,
+      "loss": 0.6703,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4004,
+      "grad_norm": 0.345483947255706,
+      "learning_rate": 0.00013626979544865367,
+      "loss": 0.6409,
+      "step": 1001
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.3840870344565062,
+      "learning_rate": 0.00013614903632843523,
+      "loss": 0.7254,
+      "step": 1002
+    },
+    {
+      "epoch": 0.4012,
+      "grad_norm": 0.34028673798735415,
+      "learning_rate": 0.00013602821653836654,
+      "loss": 0.6108,
+      "step": 1003
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3496563277340895,
+      "learning_rate": 0.0001359073362812225,
+      "loss": 0.649,
+      "step": 1004
+    },
+    {
+      "epoch": 0.402,
+      "grad_norm": 0.3590764777051446,
+      "learning_rate": 0.00013578639575987958,
+      "loss": 0.6573,
+      "step": 1005
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.36232084757956284,
+      "learning_rate": 0.00013566539517731536,
+      "loss": 0.5995,
+      "step": 1006
+    },
+    {
+      "epoch": 0.4028,
+      "grad_norm": 0.40296914583508997,
+      "learning_rate": 0.00013554433473660817,
+      "loss": 0.6468,
+      "step": 1007
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34618356579834514,
+      "learning_rate": 0.0001354232146409368,
+      "loss": 0.6311,
+      "step": 1008
+    },
+    {
+      "epoch": 0.4036,
+      "grad_norm": 0.3699522185641742,
+      "learning_rate": 0.0001353020350935803,
+      "loss": 0.6515,
+      "step": 1009
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.3402055249662885,
+      "learning_rate": 0.00013518079629791724,
+      "loss": 0.6117,
+      "step": 1010
+    },
+    {
+      "epoch": 0.4044,
+      "grad_norm": 0.3520890315363541,
+      "learning_rate": 0.00013505949845742598,
+      "loss": 0.6479,
+      "step": 1011
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3563001708547387,
+      "learning_rate": 0.00013493814177568364,
+      "loss": 0.6317,
+      "step": 1012
+    },
+    {
+      "epoch": 0.4052,
+      "grad_norm": 0.36934129376300595,
+      "learning_rate": 0.00013481672645636626,
+      "loss": 0.6336,
+      "step": 1013
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.34467273873477794,
+      "learning_rate": 0.00013469525270324835,
+      "loss": 0.6515,
+      "step": 1014
+    },
+    {
+      "epoch": 0.406,
+      "grad_norm": 0.37520625303944943,
+      "learning_rate": 0.0001345737207202023,
+      "loss": 0.6268,
+      "step": 1015
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.37153699344781604,
+      "learning_rate": 0.0001344521307111984,
+      "loss": 0.6908,
+      "step": 1016
+    },
+    {
+      "epoch": 0.4068,
+      "grad_norm": 0.3525054790816169,
+      "learning_rate": 0.00013433048288030423,
+      "loss": 0.6686,
+      "step": 1017
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.35007987426870385,
+      "learning_rate": 0.00013420877743168449,
+      "loss": 0.6506,
+      "step": 1018
+    },
+    {
+      "epoch": 0.4076,
+      "grad_norm": 0.33329849886499396,
+      "learning_rate": 0.0001340870145696005,
+      "loss": 0.588,
+      "step": 1019
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.3534071124831478,
+      "learning_rate": 0.00013396519449841005,
+      "loss": 0.6546,
+      "step": 1020
+    },
+    {
+      "epoch": 0.4084,
+      "grad_norm": 0.3583732560061697,
+      "learning_rate": 0.0001338433174225668,
+      "loss": 0.6202,
+      "step": 1021
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.3508190351005674,
+      "learning_rate": 0.0001337213835466202,
+      "loss": 0.6763,
+      "step": 1022
+    },
+    {
+      "epoch": 0.4092,
+      "grad_norm": 0.37536449187169085,
+      "learning_rate": 0.00013359939307521493,
+      "loss": 0.6846,
+      "step": 1023
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3687071147005669,
+      "learning_rate": 0.00013347734621309076,
+      "loss": 0.6739,
+      "step": 1024
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.3634034898624675,
+      "learning_rate": 0.00013335524316508208,
+      "loss": 0.6288,
+      "step": 1025
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.3732640277119394,
+      "learning_rate": 0.00013323308413611747,
+      "loss": 0.6606,
+      "step": 1026
+    },
+    {
+      "epoch": 0.4108,
+      "grad_norm": 0.351890195894751,
+      "learning_rate": 0.00013311086933121962,
+      "loss": 0.6635,
+      "step": 1027
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.39339991608869956,
+      "learning_rate": 0.00013298859895550472,
+      "loss": 0.6426,
+      "step": 1028
+    },
+    {
+      "epoch": 0.4116,
+      "grad_norm": 0.3302126076186118,
+      "learning_rate": 0.00013286627321418227,
+      "loss": 0.64,
+      "step": 1029
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.33616046205245464,
+      "learning_rate": 0.00013274389231255466,
+      "loss": 0.6597,
+      "step": 1030
+    },
+    {
+      "epoch": 0.4124,
+      "grad_norm": 0.6888957360247192,
+      "learning_rate": 0.00013262145645601692,
+      "loss": 0.6904,
+      "step": 1031
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3392597187306241,
+      "learning_rate": 0.00013249896585005628,
+      "loss": 0.6291,
+      "step": 1032
+    },
+    {
+      "epoch": 0.4132,
+      "grad_norm": 0.3592003259259342,
+      "learning_rate": 0.00013237642070025184,
+      "loss": 0.6974,
+      "step": 1033
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.4153925706260768,
+      "learning_rate": 0.0001322538212122742,
+      "loss": 0.6899,
+      "step": 1034
+    },
+    {
+      "epoch": 0.414,
+      "grad_norm": 0.37095096601650834,
+      "learning_rate": 0.00013213116759188523,
+      "loss": 0.6659,
+      "step": 1035
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.3592106483257176,
+      "learning_rate": 0.0001320084600449377,
+      "loss": 0.6997,
+      "step": 1036
+    },
+    {
+      "epoch": 0.4148,
+      "grad_norm": 0.36046345046306594,
+      "learning_rate": 0.00013188569877737474,
+      "loss": 0.6265,
+      "step": 1037
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.3613241628633839,
+      "learning_rate": 0.00013176288399522975,
+      "loss": 0.6563,
+      "step": 1038
+    },
+    {
+      "epoch": 0.4156,
+      "grad_norm": 0.3557392478888832,
+      "learning_rate": 0.0001316400159046259,
+      "loss": 0.6328,
+      "step": 1039
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.36421798184827514,
+      "learning_rate": 0.00013151709471177588,
+      "loss": 0.6568,
+      "step": 1040
+    },
+    {
+      "epoch": 0.4164,
+      "grad_norm": 0.3449672427354774,
+      "learning_rate": 0.0001313941206229814,
+      "loss": 0.6471,
+      "step": 1041
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.5251559097727022,
+      "learning_rate": 0.0001312710938446331,
+      "loss": 0.6507,
+      "step": 1042
+    },
+    {
+      "epoch": 0.4172,
+      "grad_norm": 0.3860052644841409,
+      "learning_rate": 0.00013114801458320987,
+      "loss": 0.6842,
+      "step": 1043
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.8066263740122651,
+      "learning_rate": 0.0001310248830452788,
+      "loss": 0.6253,
+      "step": 1044
+    },
+    {
+      "epoch": 0.418,
+      "grad_norm": 0.35514986303183,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.6209,
+      "step": 1045
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.36456989649396787,
+      "learning_rate": 0.00013077846396659985,
+      "loss": 0.6437,
+      "step": 1046
+    },
+    {
+      "epoch": 0.4188,
+      "grad_norm": 0.36435993468594363,
+      "learning_rate": 0.0001306551768394234,
+      "loss": 0.6373,
+      "step": 1047
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.35898161031201,
+      "learning_rate": 0.00013053183826288123,
+      "loss": 0.6257,
+      "step": 1048
+    },
+    {
+      "epoch": 0.4196,
+      "grad_norm": 0.34399594238893383,
+      "learning_rate": 0.00013040844844397574,
+      "loss": 0.6376,
+      "step": 1049
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.3639089445483288,
+      "learning_rate": 0.00013028500758979506,
+      "loss": 0.6595,
+      "step": 1050
+    },
+    {
+      "epoch": 0.4204,
+      "grad_norm": 0.3430020351995961,
+      "learning_rate": 0.0001301615159075133,
+      "loss": 0.6597,
+      "step": 1051
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.38058307582533973,
+      "learning_rate": 0.0001300379736043896,
+      "loss": 0.6555,
+      "step": 1052
+    },
+    {
+      "epoch": 0.4212,
+      "grad_norm": 0.3546300742504651,
+      "learning_rate": 0.00012991438088776817,
+      "loss": 0.6224,
+      "step": 1053
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.3525371684642542,
+      "learning_rate": 0.00012979073796507787,
+      "loss": 0.6439,
+      "step": 1054
+    },
+    {
+      "epoch": 0.422,
+      "grad_norm": 0.3574218454915876,
+      "learning_rate": 0.00012966704504383168,
+      "loss": 0.6391,
+      "step": 1055
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3383106126426222,
+      "learning_rate": 0.00012954330233162667,
+      "loss": 0.63,
+      "step": 1056
+    },
+    {
+      "epoch": 0.4228,
+      "grad_norm": 0.35807892428638477,
+      "learning_rate": 0.00012941951003614337,
+      "loss": 0.6249,
+      "step": 1057
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.3444483156769968,
+      "learning_rate": 0.00012929566836514554,
+      "loss": 0.6247,
+      "step": 1058
+    },
+    {
+      "epoch": 0.4236,
+      "grad_norm": 0.35707124361746045,
+      "learning_rate": 0.0001291717775264798,
+      "loss": 0.6771,
+      "step": 1059
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3590898231704314,
+      "learning_rate": 0.00012904783772807533,
+      "loss": 0.6532,
+      "step": 1060
+    },
+    {
+      "epoch": 0.4244,
+      "grad_norm": 0.3637411663002567,
+      "learning_rate": 0.00012892384917794346,
+      "loss": 0.6753,
+      "step": 1061
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.3461421346562393,
+      "learning_rate": 0.00012879981208417735,
+      "loss": 0.6455,
+      "step": 1062
+    },
+    {
+      "epoch": 0.4252,
+      "grad_norm": 0.35897628205140314,
+      "learning_rate": 0.00012867572665495157,
+      "loss": 0.6467,
+      "step": 1063
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.34729957726092625,
+      "learning_rate": 0.0001285515930985219,
+      "loss": 0.6485,
+      "step": 1064
+    },
+    {
+      "epoch": 0.426,
+      "grad_norm": 0.3471680462876662,
+      "learning_rate": 0.00012842741162322487,
+      "loss": 0.6186,
+      "step": 1065
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.4170020999870481,
+      "learning_rate": 0.00012830318243747736,
+      "loss": 0.633,
+      "step": 1066
+    },
+    {
+      "epoch": 0.4268,
+      "grad_norm": 0.3527236798223839,
+      "learning_rate": 0.00012817890574977646,
+      "loss": 0.6883,
+      "step": 1067
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.37101012646371456,
+      "learning_rate": 0.00012805458176869884,
+      "loss": 0.6815,
+      "step": 1068
+    },
+    {
+      "epoch": 0.4276,
+      "grad_norm": 0.36168116105218867,
+      "learning_rate": 0.00012793021070290066,
+      "loss": 0.7068,
+      "step": 1069
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.3580616421424723,
+      "learning_rate": 0.00012780579276111702,
+      "loss": 0.6534,
+      "step": 1070
+    },
+    {
+      "epoch": 0.4284,
+      "grad_norm": 0.34562766642889714,
+      "learning_rate": 0.00012768132815216173,
+      "loss": 0.6857,
+      "step": 1071
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.36003250403947595,
+      "learning_rate": 0.00012755681708492695,
+      "loss": 0.632,
+      "step": 1072
+    },
+    {
+      "epoch": 0.4292,
+      "grad_norm": 0.3604809695713053,
+      "learning_rate": 0.00012743225976838274,
+      "loss": 0.6744,
+      "step": 1073
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.3538437373444088,
+      "learning_rate": 0.0001273076564115769,
+      "loss": 0.6561,
+      "step": 1074
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.35581085943841284,
+      "learning_rate": 0.0001271830072236343,
+      "loss": 0.672,
+      "step": 1075
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.32045712249396435,
+      "learning_rate": 0.00012705831241375694,
+      "loss": 0.6231,
+      "step": 1076
+    },
+    {
+      "epoch": 0.4308,
+      "grad_norm": 0.34139655052392703,
+      "learning_rate": 0.0001269335721912233,
+      "loss": 0.677,
+      "step": 1077
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.3643977216109273,
+      "learning_rate": 0.00012680878676538804,
+      "loss": 0.6689,
+      "step": 1078
+    },
+    {
+      "epoch": 0.4316,
+      "grad_norm": 0.3660374205847108,
+      "learning_rate": 0.00012668395634568176,
+      "loss": 0.6489,
+      "step": 1079
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3368196263255544,
+      "learning_rate": 0.0001265590811416105,
+      "loss": 0.6129,
+      "step": 1080
+    },
+    {
+      "epoch": 0.4324,
+      "grad_norm": 0.3657687843961985,
+      "learning_rate": 0.00012643416136275557,
+      "loss": 0.6401,
+      "step": 1081
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.3448187133801033,
+      "learning_rate": 0.00012630919721877298,
+      "loss": 0.6328,
+      "step": 1082
+    },
+    {
+      "epoch": 0.4332,
+      "grad_norm": 0.35717875019015266,
+      "learning_rate": 0.0001261841889193932,
+      "loss": 0.6559,
+      "step": 1083
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3685023753029261,
+      "learning_rate": 0.00012605913667442095,
+      "loss": 0.6799,
+      "step": 1084
+    },
+    {
+      "epoch": 0.434,
+      "grad_norm": 0.36087411813898496,
+      "learning_rate": 0.0001259340406937345,
+      "loss": 0.6428,
+      "step": 1085
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.37381189014081895,
+      "learning_rate": 0.00012580890118728572,
+      "loss": 0.6423,
+      "step": 1086
+    },
+    {
+      "epoch": 0.4348,
+      "grad_norm": 0.351860060042875,
+      "learning_rate": 0.00012568371836509936,
+      "loss": 0.6334,
+      "step": 1087
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.31926815801438274,
+      "learning_rate": 0.00012555849243727299,
+      "loss": 0.615,
+      "step": 1088
+    },
+    {
+      "epoch": 0.4356,
+      "grad_norm": 0.3232317581728528,
+      "learning_rate": 0.00012543322361397647,
+      "loss": 0.6696,
+      "step": 1089
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.3420996415534882,
+      "learning_rate": 0.00012530791210545162,
+      "loss": 0.6164,
+      "step": 1090
+    },
+    {
+      "epoch": 0.4364,
+      "grad_norm": 0.38207290025601376,
+      "learning_rate": 0.00012518255812201203,
+      "loss": 0.6368,
+      "step": 1091
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.33912036701999765,
+      "learning_rate": 0.00012505716187404241,
+      "loss": 0.6518,
+      "step": 1092
+    },
+    {
+      "epoch": 0.4372,
+      "grad_norm": 0.36194506538681326,
+      "learning_rate": 0.00012493172357199857,
+      "loss": 0.698,
+      "step": 1093
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.34036246217276134,
+      "learning_rate": 0.00012480624342640673,
+      "loss": 0.6231,
+      "step": 1094
+    },
+    {
+      "epoch": 0.438,
+      "grad_norm": 0.3343319436104641,
+      "learning_rate": 0.0001246807216478634,
+      "loss": 0.6036,
+      "step": 1095
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.35032259157872203,
+      "learning_rate": 0.0001245551584470351,
+      "loss": 0.6167,
+      "step": 1096
+    },
+    {
+      "epoch": 0.4388,
+      "grad_norm": 0.4146810033641503,
+      "learning_rate": 0.00012442955403465768,
+      "loss": 0.6654,
+      "step": 1097
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.35832129820145203,
+      "learning_rate": 0.00012430390862153625,
+      "loss": 0.6339,
+      "step": 1098
+    },
+    {
+      "epoch": 0.4396,
+      "grad_norm": 0.358377151169997,
+      "learning_rate": 0.00012417822241854467,
+      "loss": 0.6366,
+      "step": 1099
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.36373121113996665,
+      "learning_rate": 0.00012405249563662537,
+      "loss": 0.6565,
+      "step": 1100
+    },
+    {
+      "epoch": 0.4404,
+      "grad_norm": 0.3594541755442927,
+      "learning_rate": 0.00012392672848678877,
+      "loss": 0.6997,
+      "step": 1101
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.3629727624468758,
+      "learning_rate": 0.0001238009211801131,
+      "loss": 0.7129,
+      "step": 1102
+    },
+    {
+      "epoch": 0.4412,
+      "grad_norm": 0.3350770583209822,
+      "learning_rate": 0.00012367507392774398,
+      "loss": 0.6296,
+      "step": 1103
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.35016548464672453,
+      "learning_rate": 0.00012354918694089406,
+      "loss": 0.6335,
+      "step": 1104
+    },
+    {
+      "epoch": 0.442,
+      "grad_norm": 0.3519380630599524,
+      "learning_rate": 0.00012342326043084266,
+      "loss": 0.6988,
+      "step": 1105
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.35177244271116015,
+      "learning_rate": 0.00012329729460893552,
+      "loss": 0.6384,
+      "step": 1106
+    },
+    {
+      "epoch": 0.4428,
+      "grad_norm": 0.35467429467861333,
+      "learning_rate": 0.00012317128968658425,
+      "loss": 0.6653,
+      "step": 1107
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3589107500094258,
+      "learning_rate": 0.0001230452458752661,
+      "loss": 0.6427,
+      "step": 1108
+    },
+    {
+      "epoch": 0.4436,
+      "grad_norm": 0.34188766267721576,
+      "learning_rate": 0.00012291916338652364,
+      "loss": 0.6229,
+      "step": 1109
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.3337032531326285,
+      "learning_rate": 0.00012279304243196436,
+      "loss": 0.6307,
+      "step": 1110
+    },
+    {
+      "epoch": 0.4444,
+      "grad_norm": 0.3375032722495382,
+      "learning_rate": 0.00012266688322326024,
+      "loss": 0.6475,
+      "step": 1111
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3611670117129286,
+      "learning_rate": 0.0001225406859721475,
+      "loss": 0.65,
+      "step": 1112
+    },
+    {
+      "epoch": 0.4452,
+      "grad_norm": 0.3616691548735018,
+      "learning_rate": 0.00012241445089042623,
+      "loss": 0.6473,
+      "step": 1113
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.37326942100056826,
+      "learning_rate": 0.00012228817818995996,
+      "loss": 0.6276,
+      "step": 1114
+    },
+    {
+      "epoch": 0.446,
+      "grad_norm": 0.3482463645852628,
+      "learning_rate": 0.00012216186808267546,
+      "loss": 0.6567,
+      "step": 1115
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3525700826952431,
+      "learning_rate": 0.00012203552078056209,
+      "loss": 0.6498,
+      "step": 1116
+    },
+    {
+      "epoch": 0.4468,
+      "grad_norm": 0.37206347910603454,
+      "learning_rate": 0.00012190913649567184,
+      "loss": 0.6466,
+      "step": 1117
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.36237934031288455,
+      "learning_rate": 0.00012178271544011863,
+      "loss": 0.6255,
+      "step": 1118
+    },
+    {
+      "epoch": 0.4476,
+      "grad_norm": 0.3515339387422734,
+      "learning_rate": 0.00012165625782607817,
+      "loss": 0.5916,
+      "step": 1119
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3689400504529256,
+      "learning_rate": 0.0001215297638657875,
+      "loss": 0.6404,
+      "step": 1120
+    },
+    {
+      "epoch": 0.4484,
+      "grad_norm": 0.34157562776012124,
+      "learning_rate": 0.00012140323377154466,
+      "loss": 0.6106,
+      "step": 1121
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.368410902246951,
+      "learning_rate": 0.00012127666775570836,
+      "loss": 0.6499,
+      "step": 1122
+    },
+    {
+      "epoch": 0.4492,
+      "grad_norm": 0.3539844358616864,
+      "learning_rate": 0.0001211500660306975,
+      "loss": 0.6585,
+      "step": 1123
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3673415854540668,
+      "learning_rate": 0.00012102342880899109,
+      "loss": 0.7017,
+      "step": 1124
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.3518001322298715,
+      "learning_rate": 0.00012089675630312754,
+      "loss": 0.6741,
+      "step": 1125
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.3360602890643158,
+      "learning_rate": 0.00012077004872570454,
+      "loss": 0.5728,
+      "step": 1126
+    },
+    {
+      "epoch": 0.4508,
+      "grad_norm": 0.3458319940972609,
+      "learning_rate": 0.0001206433062893787,
+      "loss": 0.6127,
+      "step": 1127
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3713090287879673,
+      "learning_rate": 0.00012051652920686505,
+      "loss": 0.6804,
+      "step": 1128
+    },
+    {
+      "epoch": 0.4516,
+      "grad_norm": 0.38617354952982746,
+      "learning_rate": 0.00012038971769093686,
+      "loss": 0.6677,
+      "step": 1129
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.3524726737540465,
+      "learning_rate": 0.00012026287195442503,
+      "loss": 0.6201,
+      "step": 1130
+    },
+    {
+      "epoch": 0.4524,
+      "grad_norm": 0.3484102447747805,
+      "learning_rate": 0.0001201359922102181,
+      "loss": 0.6489,
+      "step": 1131
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.33740052898910877,
+      "learning_rate": 0.0001200090786712615,
+      "loss": 0.6275,
+      "step": 1132
+    },
+    {
+      "epoch": 0.4532,
+      "grad_norm": 0.33438727982873373,
+      "learning_rate": 0.00011988213155055754,
+      "loss": 0.6378,
+      "step": 1133
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.3631159101354578,
+      "learning_rate": 0.00011975515106116472,
+      "loss": 0.6554,
+      "step": 1134
+    },
+    {
+      "epoch": 0.454,
+      "grad_norm": 0.33194697447543714,
+      "learning_rate": 0.00011962813741619777,
+      "loss": 0.6633,
+      "step": 1135
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.36456216988924117,
+      "learning_rate": 0.00011950109082882681,
+      "loss": 0.6077,
+      "step": 1136
+    },
+    {
+      "epoch": 0.4548,
+      "grad_norm": 0.3834217067690752,
+      "learning_rate": 0.0001193740115122774,
+      "loss": 0.6044,
+      "step": 1137
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.35361345763259006,
+      "learning_rate": 0.00011924689967983006,
+      "loss": 0.6219,
+      "step": 1138
+    },
+    {
+      "epoch": 0.4556,
+      "grad_norm": 0.34553967278756625,
+      "learning_rate": 0.00011911975554481971,
+      "loss": 0.638,
+      "step": 1139
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3836534901465859,
+      "learning_rate": 0.0001189925793206357,
+      "loss": 0.6969,
+      "step": 1140
+    },
+    {
+      "epoch": 0.4564,
+      "grad_norm": 0.3638036872842273,
+      "learning_rate": 0.00011886537122072105,
+      "loss": 0.6725,
+      "step": 1141
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.39978315045217133,
+      "learning_rate": 0.00011873813145857249,
+      "loss": 0.6513,
+      "step": 1142
+    },
+    {
+      "epoch": 0.4572,
+      "grad_norm": 0.3806959135931955,
+      "learning_rate": 0.00011861086024773962,
+      "loss": 0.679,
+      "step": 1143
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.35341600190389044,
+      "learning_rate": 0.000118483557801825,
+      "loss": 0.6578,
+      "step": 1144
+    },
+    {
+      "epoch": 0.458,
+      "grad_norm": 0.3563699363910567,
+      "learning_rate": 0.00011835622433448361,
+      "loss": 0.6471,
+      "step": 1145
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.33067674651689005,
+      "learning_rate": 0.00011822886005942244,
+      "loss": 0.6387,
+      "step": 1146
+    },
+    {
+      "epoch": 0.4588,
+      "grad_norm": 0.35904413352647985,
+      "learning_rate": 0.00011810146519040021,
+      "loss": 0.6727,
+      "step": 1147
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.6630529372769212,
+      "learning_rate": 0.00011797403994122698,
+      "loss": 0.6404,
+      "step": 1148
+    },
+    {
+      "epoch": 0.4596,
+      "grad_norm": 0.32328732899515183,
+      "learning_rate": 0.00011784658452576378,
+      "loss": 0.6218,
+      "step": 1149
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.3674146730299686,
+      "learning_rate": 0.0001177190991579223,
+      "loss": 0.6411,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4604,
+      "grad_norm": 0.3464141037597477,
+      "learning_rate": 0.00011759158405166446,
+      "loss": 0.6021,
+      "step": 1151
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3462192086231998,
+      "learning_rate": 0.00011746403942100215,
+      "loss": 0.6465,
+      "step": 1152
+    },
+    {
+      "epoch": 0.4612,
+      "grad_norm": 0.3373173125544461,
+      "learning_rate": 0.00011733646547999677,
+      "loss": 0.5826,
+      "step": 1153
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.35866904219362455,
+      "learning_rate": 0.00011720886244275893,
+      "loss": 0.639,
+      "step": 1154
+    },
+    {
+      "epoch": 0.462,
+      "grad_norm": 0.3731033563035224,
+      "learning_rate": 0.00011708123052344804,
+      "loss": 0.613,
+      "step": 1155
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3693917713517358,
+      "learning_rate": 0.00011695356993627202,
+      "loss": 0.6582,
+      "step": 1156
+    },
+    {
+      "epoch": 0.4628,
+      "grad_norm": 0.3527565389122925,
+      "learning_rate": 0.00011682588089548692,
+      "loss": 0.643,
+      "step": 1157
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.4065757328066187,
+      "learning_rate": 0.00011669816361539647,
+      "loss": 0.6243,
+      "step": 1158
+    },
+    {
+      "epoch": 0.4636,
+      "grad_norm": 0.3401808120681071,
+      "learning_rate": 0.00011657041831035184,
+      "loss": 0.6475,
+      "step": 1159
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.33491804020140525,
+      "learning_rate": 0.0001164426451947513,
+      "loss": 0.5667,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4644,
+      "grad_norm": 0.3509247945129264,
+      "learning_rate": 0.00011631484448303965,
+      "loss": 0.6602,
+      "step": 1161
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.3448036143304661,
+      "learning_rate": 0.00011618701638970814,
+      "loss": 0.6396,
+      "step": 1162
+    },
+    {
+      "epoch": 0.4652,
+      "grad_norm": 0.35217793454086344,
+      "learning_rate": 0.00011605916112929388,
+      "loss": 0.6358,
+      "step": 1163
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4157376625846788,
+      "learning_rate": 0.00011593127891637967,
+      "loss": 0.6287,
+      "step": 1164
+    },
+    {
+      "epoch": 0.466,
+      "grad_norm": 0.3518045641667168,
+      "learning_rate": 0.00011580336996559343,
+      "loss": 0.6457,
+      "step": 1165
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.3851283507513587,
+      "learning_rate": 0.00011567543449160809,
+      "loss": 0.6561,
+      "step": 1166
+    },
+    {
+      "epoch": 0.4668,
+      "grad_norm": 0.3467525558653285,
+      "learning_rate": 0.00011554747270914097,
+      "loss": 0.6543,
+      "step": 1167
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.36874210413197406,
+      "learning_rate": 0.00011541948483295357,
+      "loss": 0.6428,
+      "step": 1168
+    },
+    {
+      "epoch": 0.4676,
+      "grad_norm": 0.35625172641639313,
+      "learning_rate": 0.00011529147107785128,
+      "loss": 0.6327,
+      "step": 1169
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.3391724767182548,
+      "learning_rate": 0.00011516343165868279,
+      "loss": 0.6185,
+      "step": 1170
+    },
+    {
+      "epoch": 0.4684,
+      "grad_norm": 0.3732951354423005,
+      "learning_rate": 0.00011503536679033999,
+      "loss": 0.6607,
+      "step": 1171
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.33778442308236173,
+      "learning_rate": 0.00011490727668775733,
+      "loss": 0.6351,
+      "step": 1172
+    },
+    {
+      "epoch": 0.4692,
+      "grad_norm": 0.3597063141045392,
+      "learning_rate": 0.00011477916156591179,
+      "loss": 0.6181,
+      "step": 1173
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.36179807923506846,
+      "learning_rate": 0.00011465102163982217,
+      "loss": 0.6746,
+      "step": 1174
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.3330308143333079,
+      "learning_rate": 0.00011452285712454904,
+      "loss": 0.5767,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.35813905476401037,
+      "learning_rate": 0.00011439466823519414,
+      "loss": 0.6646,
+      "step": 1176
+    },
+    {
+      "epoch": 0.4708,
+      "grad_norm": 0.3450569829700074,
+      "learning_rate": 0.00011426645518690016,
+      "loss": 0.6575,
+      "step": 1177
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.4133442723018541,
+      "learning_rate": 0.00011413821819485035,
+      "loss": 0.6522,
+      "step": 1178
+    },
+    {
+      "epoch": 0.4716,
+      "grad_norm": 0.34402182080967436,
+      "learning_rate": 0.00011400995747426811,
+      "loss": 0.5987,
+      "step": 1179
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.35033698500174304,
+      "learning_rate": 0.00011388167324041669,
+      "loss": 0.6686,
+      "step": 1180
+    },
+    {
+      "epoch": 0.4724,
+      "grad_norm": 0.36188935714893783,
+      "learning_rate": 0.00011375336570859876,
+      "loss": 0.6262,
+      "step": 1181
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.3719189511733754,
+      "learning_rate": 0.00011362503509415619,
+      "loss": 0.6258,
+      "step": 1182
+    },
+    {
+      "epoch": 0.4732,
+      "grad_norm": 0.3569183083026708,
+      "learning_rate": 0.00011349668161246944,
+      "loss": 0.6232,
+      "step": 1183
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.34895691234867987,
+      "learning_rate": 0.00011336830547895752,
+      "loss": 0.6486,
+      "step": 1184
+    },
+    {
+      "epoch": 0.474,
+      "grad_norm": 0.33847296120051706,
+      "learning_rate": 0.00011323990690907733,
+      "loss": 0.6376,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.3564539231903446,
+      "learning_rate": 0.00011311148611832345,
+      "loss": 0.6851,
+      "step": 1186
+    },
+    {
+      "epoch": 0.4748,
+      "grad_norm": 0.37641134678983584,
+      "learning_rate": 0.0001129830433222278,
+      "loss": 0.6495,
+      "step": 1187
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3483141808653871,
+      "learning_rate": 0.00011285457873635921,
+      "loss": 0.642,
+      "step": 1188
+    },
+    {
+      "epoch": 0.4756,
+      "grad_norm": 0.3628071073948621,
+      "learning_rate": 0.00011272609257632305,
+      "loss": 0.6341,
+      "step": 1189
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.3565896734004511,
+      "learning_rate": 0.00011259758505776092,
+      "loss": 0.6375,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4764,
+      "grad_norm": 0.34875490237998463,
+      "learning_rate": 0.00011246905639635029,
+      "loss": 0.6194,
+      "step": 1191
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3528192238282489,
+      "learning_rate": 0.00011234050680780406,
+      "loss": 0.622,
+      "step": 1192
+    },
+    {
+      "epoch": 0.4772,
+      "grad_norm": 0.33711625222077923,
+      "learning_rate": 0.00011221193650787032,
+      "loss": 0.6242,
+      "step": 1193
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.36364403618449903,
+      "learning_rate": 0.00011208334571233185,
+      "loss": 0.6329,
+      "step": 1194
+    },
+    {
+      "epoch": 0.478,
+      "grad_norm": 0.3458245750874432,
+      "learning_rate": 0.0001119547346370059,
+      "loss": 0.6953,
+      "step": 1195
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.35316328695100874,
+      "learning_rate": 0.0001118261034977437,
+      "loss": 0.6158,
+      "step": 1196
+    },
+    {
+      "epoch": 0.4788,
+      "grad_norm": 0.3682412673076238,
+      "learning_rate": 0.00011169745251043021,
+      "loss": 0.7115,
+      "step": 1197
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.33913215904512334,
+      "learning_rate": 0.00011156878189098356,
+      "loss": 0.6328,
+      "step": 1198
+    },
+    {
+      "epoch": 0.4796,
+      "grad_norm": 0.34887865709752547,
+      "learning_rate": 0.00011144009185535509,
+      "loss": 0.6243,
+      "step": 1199
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3663865307807514,
+      "learning_rate": 0.00011131138261952845,
+      "loss": 0.6191,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4804,
+      "grad_norm": 0.3370232501645842,
+      "learning_rate": 0.00011118265439951967,
+      "loss": 0.5982,
+      "step": 1201
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.35896060343046443,
+      "learning_rate": 0.0001110539074113766,
+      "loss": 0.6726,
+      "step": 1202
+    },
+    {
+      "epoch": 0.4812,
+      "grad_norm": 0.3301155377914508,
+      "learning_rate": 0.00011092514187117864,
+      "loss": 0.6175,
+      "step": 1203
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3419130759760223,
+      "learning_rate": 0.00011079635799503624,
+      "loss": 0.6229,
+      "step": 1204
+    },
+    {
+      "epoch": 0.482,
+      "grad_norm": 0.3548958405548371,
+      "learning_rate": 0.00011066755599909064,
+      "loss": 0.6612,
+      "step": 1205
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.33624645660686237,
+      "learning_rate": 0.00011053873609951362,
+      "loss": 0.652,
+      "step": 1206
+    },
+    {
+      "epoch": 0.4828,
+      "grad_norm": 0.3594937632014105,
+      "learning_rate": 0.00011040989851250678,
+      "loss": 0.6674,
+      "step": 1207
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3513060841937797,
+      "learning_rate": 0.00011028104345430161,
+      "loss": 0.6305,
+      "step": 1208
+    },
+    {
+      "epoch": 0.4836,
+      "grad_norm": 0.34984900178680145,
+      "learning_rate": 0.00011015217114115883,
+      "loss": 0.6737,
+      "step": 1209
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.34433824279802844,
+      "learning_rate": 0.00011002328178936811,
+      "loss": 0.636,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4844,
+      "grad_norm": 0.35542548151075953,
+      "learning_rate": 0.00010989437561524776,
+      "loss": 0.6236,
+      "step": 1211
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.33325446198619896,
+      "learning_rate": 0.0001097654528351443,
+      "loss": 0.5909,
+      "step": 1212
+    },
+    {
+      "epoch": 0.4852,
+      "grad_norm": 0.3737512950007258,
+      "learning_rate": 0.00010963651366543213,
+      "loss": 0.615,
+      "step": 1213
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.34473431451785225,
+      "learning_rate": 0.0001095075583225131,
+      "loss": 0.6144,
+      "step": 1214
+    },
+    {
+      "epoch": 0.486,
+      "grad_norm": 0.36014593259355815,
+      "learning_rate": 0.00010937858702281631,
+      "loss": 0.6516,
+      "step": 1215
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3862404785487015,
+      "learning_rate": 0.00010924959998279753,
+      "loss": 0.6791,
+      "step": 1216
+    },
+    {
+      "epoch": 0.4868,
+      "grad_norm": 0.3625573244343477,
+      "learning_rate": 0.00010912059741893908,
+      "loss": 0.6865,
+      "step": 1217
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.3289197732318271,
+      "learning_rate": 0.00010899157954774919,
+      "loss": 0.6207,
+      "step": 1218
+    },
+    {
+      "epoch": 0.4876,
+      "grad_norm": 0.3600277694995468,
+      "learning_rate": 0.00010886254658576184,
+      "loss": 0.6383,
+      "step": 1219
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3543077565948878,
+      "learning_rate": 0.0001087334987495364,
+      "loss": 0.6723,
+      "step": 1220
+    },
+    {
+      "epoch": 0.4884,
+      "grad_norm": 0.3309490300200102,
+      "learning_rate": 0.0001086044362556571,
+      "loss": 0.6172,
+      "step": 1221
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.366905751195663,
+      "learning_rate": 0.00010847535932073287,
+      "loss": 0.681,
+      "step": 1222
+    },
+    {
+      "epoch": 0.4892,
+      "grad_norm": 0.3968664880824747,
+      "learning_rate": 0.00010834626816139677,
+      "loss": 0.6659,
+      "step": 1223
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3587325502034165,
+      "learning_rate": 0.00010821716299430578,
+      "loss": 0.6258,
+      "step": 1224
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.3485719603322062,
+      "learning_rate": 0.00010808804403614043,
+      "loss": 0.6643,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.3484885283904212,
+      "learning_rate": 0.00010795891150360435,
+      "loss": 0.6372,
+      "step": 1226
+    },
+    {
+      "epoch": 0.4908,
+      "grad_norm": 0.3678839256648748,
+      "learning_rate": 0.00010782976561342398,
+      "loss": 0.6328,
+      "step": 1227
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.33509059890827697,
+      "learning_rate": 0.00010770060658234815,
+      "loss": 0.6262,
+      "step": 1228
+    },
+    {
+      "epoch": 0.4916,
+      "grad_norm": 0.33251123198984384,
+      "learning_rate": 0.00010757143462714777,
+      "loss": 0.6462,
+      "step": 1229
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.35032556086450606,
+      "learning_rate": 0.0001074422499646154,
+      "loss": 0.6057,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4924,
+      "grad_norm": 0.3744452549493245,
+      "learning_rate": 0.00010731305281156498,
+      "loss": 0.6257,
+      "step": 1231
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3270291197362055,
+      "learning_rate": 0.00010718384338483141,
+      "loss": 0.6118,
+      "step": 1232
+    },
+    {
+      "epoch": 0.4932,
+      "grad_norm": 0.34676008033704087,
+      "learning_rate": 0.00010705462190127011,
+      "loss": 0.629,
+      "step": 1233
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.3663047890903659,
+      "learning_rate": 0.00010692538857775684,
+      "loss": 0.6203,
+      "step": 1234
+    },
+    {
+      "epoch": 0.494,
+      "grad_norm": 0.3272336309368646,
+      "learning_rate": 0.00010679614363118717,
+      "loss": 0.5978,
+      "step": 1235
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.35591471231657873,
+      "learning_rate": 0.00010666688727847621,
+      "loss": 0.6532,
+      "step": 1236
+    },
+    {
+      "epoch": 0.4948,
+      "grad_norm": 0.3452362011658614,
+      "learning_rate": 0.00010653761973655819,
+      "loss": 0.6126,
+      "step": 1237
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.3503817246383713,
+      "learning_rate": 0.00010640834122238606,
+      "loss": 0.6549,
+      "step": 1238
+    },
+    {
+      "epoch": 0.4956,
+      "grad_norm": 0.35621375880558626,
+      "learning_rate": 0.00010627905195293135,
+      "loss": 0.6651,
+      "step": 1239
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3757460056453115,
+      "learning_rate": 0.0001061497521451835,
+      "loss": 0.6685,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4964,
+      "grad_norm": 0.3560824278739074,
+      "learning_rate": 0.00010602044201614965,
+      "loss": 0.6177,
+      "step": 1241
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.3426459162810895,
+      "learning_rate": 0.00010589112178285432,
+      "loss": 0.5979,
+      "step": 1242
+    },
+    {
+      "epoch": 0.4972,
+      "grad_norm": 0.3424236595236985,
+      "learning_rate": 0.00010576179166233895,
+      "loss": 0.6452,
+      "step": 1243
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.37326455403865544,
+      "learning_rate": 0.0001056324518716616,
+      "loss": 0.6839,
+      "step": 1244
+    },
+    {
+      "epoch": 0.498,
+      "grad_norm": 0.3447475867572372,
+      "learning_rate": 0.00010550310262789649,
+      "loss": 0.669,
+      "step": 1245
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.3645476627658833,
+      "learning_rate": 0.00010537374414813383,
+      "loss": 0.648,
+      "step": 1246
+    },
+    {
+      "epoch": 0.4988,
+      "grad_norm": 0.32084533765043544,
+      "learning_rate": 0.00010524437664947917,
+      "loss": 0.5702,
+      "step": 1247
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3639071006713642,
+      "learning_rate": 0.0001051150003490534,
+      "loss": 0.6801,
+      "step": 1248
+    },
+    {
+      "epoch": 0.4996,
+      "grad_norm": 0.4188043715421482,
+      "learning_rate": 0.00010498561546399193,
+      "loss": 0.6284,
+      "step": 1249
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.35399792562477883,
+      "learning_rate": 0.00010485622221144484,
+      "loss": 0.6328,
+      "step": 1250
+    },
+    {
+      "epoch": 0.5004,
+      "grad_norm": 0.3374870043789103,
+      "learning_rate": 0.00010472682080857606,
+      "loss": 0.6291,
+      "step": 1251
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.33333080790463976,
+      "learning_rate": 0.00010459741147256326,
+      "loss": 0.61,
+      "step": 1252
+    },
+    {
+      "epoch": 0.5012,
+      "grad_norm": 0.3454834033242172,
+      "learning_rate": 0.00010446799442059749,
+      "loss": 0.5589,
+      "step": 1253
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.34029312366152303,
+      "learning_rate": 0.0001043385698698826,
+      "loss": 0.6486,
+      "step": 1254
+    },
+    {
+      "epoch": 0.502,
+      "grad_norm": 0.3540017630496081,
+      "learning_rate": 0.00010420913803763521,
+      "loss": 0.6303,
+      "step": 1255
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3488929724271329,
+      "learning_rate": 0.00010407969914108399,
+      "loss": 0.6315,
+      "step": 1256
+    },
+    {
+      "epoch": 0.5028,
+      "grad_norm": 0.35884525176733456,
+      "learning_rate": 0.00010395025339746964,
+      "loss": 0.5907,
+      "step": 1257
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.3254700117467145,
+      "learning_rate": 0.00010382080102404417,
+      "loss": 0.5824,
+      "step": 1258
+    },
+    {
+      "epoch": 0.5036,
+      "grad_norm": 0.3487802649810632,
+      "learning_rate": 0.00010369134223807082,
+      "loss": 0.5929,
+      "step": 1259
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.34198688144213285,
+      "learning_rate": 0.00010356187725682359,
+      "loss": 0.6582,
+      "step": 1260
+    },
+    {
+      "epoch": 0.5044,
+      "grad_norm": 0.35645009121312554,
+      "learning_rate": 0.00010343240629758684,
+      "loss": 0.6518,
+      "step": 1261
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.3708941180601143,
+      "learning_rate": 0.00010330292957765501,
+      "loss": 0.675,
+      "step": 1262
+    },
+    {
+      "epoch": 0.5052,
+      "grad_norm": 0.3353731136131598,
+      "learning_rate": 0.00010317344731433216,
+      "loss": 0.5948,
+      "step": 1263
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3469344087220699,
+      "learning_rate": 0.00010304395972493172,
+      "loss": 0.6692,
+      "step": 1264
+    },
+    {
+      "epoch": 0.506,
+      "grad_norm": 0.3609414613558415,
+      "learning_rate": 0.00010291446702677599,
+      "loss": 0.6696,
+      "step": 1265
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.3427685116161911,
+      "learning_rate": 0.00010278496943719584,
+      "loss": 0.6261,
+      "step": 1266
+    },
+    {
+      "epoch": 0.5068,
+      "grad_norm": 0.3392486050691237,
+      "learning_rate": 0.00010265546717353041,
+      "loss": 0.6241,
+      "step": 1267
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3361004815479842,
+      "learning_rate": 0.00010252596045312666,
+      "loss": 0.5729,
+      "step": 1268
+    },
+    {
+      "epoch": 0.5076,
+      "grad_norm": 0.3463110258286205,
+      "learning_rate": 0.000102396449493339,
+      "loss": 0.6744,
+      "step": 1269
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.3515275610021229,
+      "learning_rate": 0.000102266934511529,
+      "loss": 0.6336,
+      "step": 1270
+    },
+    {
+      "epoch": 0.5084,
+      "grad_norm": 0.33174849902873876,
+      "learning_rate": 0.00010213741572506497,
+      "loss": 0.6264,
+      "step": 1271
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.33455758686453246,
+      "learning_rate": 0.00010200789335132158,
+      "loss": 0.586,
+      "step": 1272
+    },
+    {
+      "epoch": 0.5092,
+      "grad_norm": 0.3545808144691893,
+      "learning_rate": 0.00010187836760767953,
+      "loss": 0.6328,
+      "step": 1273
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.36242011393728973,
+      "learning_rate": 0.00010174883871152516,
+      "loss": 0.6553,
+      "step": 1274
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.34778223748948844,
+      "learning_rate": 0.00010161930688025017,
+      "loss": 0.6424,
+      "step": 1275
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3359680423864884,
+      "learning_rate": 0.0001014897723312511,
+      "loss": 0.6044,
+      "step": 1276
+    },
+    {
+      "epoch": 0.5108,
+      "grad_norm": 0.35534263644066355,
+      "learning_rate": 0.0001013602352819291,
+      "loss": 0.6757,
+      "step": 1277
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.3494413570544863,
+      "learning_rate": 0.00010123069594968952,
+      "loss": 0.6295,
+      "step": 1278
+    },
+    {
+      "epoch": 0.5116,
+      "grad_norm": 0.35573065101587703,
+      "learning_rate": 0.00010110115455194156,
+      "loss": 0.6753,
+      "step": 1279
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.33097794748432213,
+      "learning_rate": 0.00010097161130609773,
+      "loss": 0.6093,
+      "step": 1280
+    },
+    {
+      "epoch": 0.5124,
+      "grad_norm": 0.35570883664507547,
+      "learning_rate": 0.00010084206642957393,
+      "loss": 0.6317,
+      "step": 1281
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.35106539136984977,
+      "learning_rate": 0.0001007125201397885,
+      "loss": 0.614,
+      "step": 1282
+    },
+    {
+      "epoch": 0.5132,
+      "grad_norm": 0.34128861680097217,
+      "learning_rate": 0.00010058297265416234,
+      "loss": 0.592,
+      "step": 1283
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.342487405813307,
+      "learning_rate": 0.00010045342419011832,
+      "loss": 0.6505,
+      "step": 1284
+    },
+    {
+      "epoch": 0.514,
+      "grad_norm": 0.34305231840107203,
+      "learning_rate": 0.00010032387496508089,
+      "loss": 0.6369,
+      "step": 1285
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.33378806377069,
+      "learning_rate": 0.00010019432519647585,
+      "loss": 0.6445,
+      "step": 1286
+    },
+    {
+      "epoch": 0.5148,
+      "grad_norm": 0.3375143627285051,
+      "learning_rate": 0.00010006477510172985,
+      "loss": 0.6599,
+      "step": 1287
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.35097006621776994,
+      "learning_rate": 9.993522489827016e-05,
+      "loss": 0.6107,
+      "step": 1288
+    },
+    {
+      "epoch": 0.5156,
+      "grad_norm": 0.33587296058662613,
+      "learning_rate": 9.980567480352416e-05,
+      "loss": 0.6303,
+      "step": 1289
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.35666680105213716,
+      "learning_rate": 9.967612503491914e-05,
+      "loss": 0.651,
+      "step": 1290
+    },
+    {
+      "epoch": 0.5164,
+      "grad_norm": 0.33545182714986216,
+      "learning_rate": 9.954657580988172e-05,
+      "loss": 0.6459,
+      "step": 1291
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.34485682456959477,
+      "learning_rate": 9.94170273458377e-05,
+      "loss": 0.6547,
+      "step": 1292
+    },
+    {
+      "epoch": 0.5172,
+      "grad_norm": 0.343013812873406,
+      "learning_rate": 9.928747986021152e-05,
+      "loss": 0.6334,
+      "step": 1293
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.34384390758781147,
+      "learning_rate": 9.91579335704261e-05,
+      "loss": 0.63,
+      "step": 1294
+    },
+    {
+      "epoch": 0.518,
+      "grad_norm": 0.3234300874086102,
+      "learning_rate": 9.902838869390229e-05,
+      "loss": 0.5909,
+      "step": 1295
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3268275950873985,
+      "learning_rate": 9.88988454480585e-05,
+      "loss": 0.5996,
+      "step": 1296
+    },
+    {
+      "epoch": 0.5188,
+      "grad_norm": 0.3597660984295147,
+      "learning_rate": 9.876930405031047e-05,
+      "loss": 0.6504,
+      "step": 1297
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.35453593691446306,
+      "learning_rate": 9.863976471807089e-05,
+      "loss": 0.6323,
+      "step": 1298
+    },
+    {
+      "epoch": 0.5196,
+      "grad_norm": 0.3531127558340611,
+      "learning_rate": 9.851022766874893e-05,
+      "loss": 0.6227,
+      "step": 1299
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.34845680554253494,
+      "learning_rate": 9.838069311974986e-05,
+      "loss": 0.5916,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5204,
+      "grad_norm": 0.33831499539595655,
+      "learning_rate": 9.825116128847488e-05,
+      "loss": 0.6246,
+      "step": 1301
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.33747552361821015,
+      "learning_rate": 9.812163239232051e-05,
+      "loss": 0.6098,
+      "step": 1302
+    },
+    {
+      "epoch": 0.5212,
+      "grad_norm": 0.3487999262928088,
+      "learning_rate": 9.799210664867843e-05,
+      "loss": 0.6142,
+      "step": 1303
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3519251218931331,
+      "learning_rate": 9.786258427493505e-05,
+      "loss": 0.6087,
+      "step": 1304
+    },
+    {
+      "epoch": 0.522,
+      "grad_norm": 0.3654639303417923,
+      "learning_rate": 9.7733065488471e-05,
+      "loss": 0.6872,
+      "step": 1305
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.3684929790585567,
+      "learning_rate": 9.760355050666102e-05,
+      "loss": 0.6679,
+      "step": 1306
+    },
+    {
+      "epoch": 0.5228,
+      "grad_norm": 0.34737766133230247,
+      "learning_rate": 9.747403954687334e-05,
+      "loss": 0.5503,
+      "step": 1307
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3385619913992404,
+      "learning_rate": 9.734453282646961e-05,
+      "loss": 0.624,
+      "step": 1308
+    },
+    {
+      "epoch": 0.5236,
+      "grad_norm": 0.3375394800263621,
+      "learning_rate": 9.721503056280418e-05,
+      "loss": 0.6492,
+      "step": 1309
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.4104809079299157,
+      "learning_rate": 9.708553297322406e-05,
+      "loss": 0.6269,
+      "step": 1310
+    },
+    {
+      "epoch": 0.5244,
+      "grad_norm": 0.34047952869390213,
+      "learning_rate": 9.695604027506829e-05,
+      "loss": 0.6216,
+      "step": 1311
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.32997830779488657,
+      "learning_rate": 9.682655268566783e-05,
+      "loss": 0.6367,
+      "step": 1312
+    },
+    {
+      "epoch": 0.5252,
+      "grad_norm": 0.3502591733179358,
+      "learning_rate": 9.669707042234501e-05,
+      "loss": 0.5962,
+      "step": 1313
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.36226227435034725,
+      "learning_rate": 9.656759370241319e-05,
+      "loss": 0.6322,
+      "step": 1314
+    },
+    {
+      "epoch": 0.526,
+      "grad_norm": 0.36136848736690047,
+      "learning_rate": 9.643812274317644e-05,
+      "loss": 0.701,
+      "step": 1315
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.33275918560578055,
+      "learning_rate": 9.630865776192918e-05,
+      "loss": 0.6091,
+      "step": 1316
+    },
+    {
+      "epoch": 0.5268,
+      "grad_norm": 0.3504721176055122,
+      "learning_rate": 9.617919897595586e-05,
+      "loss": 0.6337,
+      "step": 1317
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.3861122537747958,
+      "learning_rate": 9.604974660253037e-05,
+      "loss": 0.6548,
+      "step": 1318
+    },
+    {
+      "epoch": 0.5276,
+      "grad_norm": 0.33354428832287447,
+      "learning_rate": 9.592030085891602e-05,
+      "loss": 0.5992,
+      "step": 1319
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.33783856158488534,
+      "learning_rate": 9.579086196236482e-05,
+      "loss": 0.6165,
+      "step": 1320
+    },
+    {
+      "epoch": 0.5284,
+      "grad_norm": 0.3397692219995272,
+      "learning_rate": 9.56614301301174e-05,
+      "loss": 0.5972,
+      "step": 1321
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.3726436179559889,
+      "learning_rate": 9.553200557940253e-05,
+      "loss": 0.6567,
+      "step": 1322
+    },
+    {
+      "epoch": 0.5292,
+      "grad_norm": 0.3740094636261001,
+      "learning_rate": 9.540258852743676e-05,
+      "loss": 0.6072,
+      "step": 1323
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3473144180024521,
+      "learning_rate": 9.527317919142398e-05,
+      "loss": 0.6699,
+      "step": 1324
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.35049605499757824,
+      "learning_rate": 9.514377778855521e-05,
+      "loss": 0.6654,
+      "step": 1325
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.33126843882063756,
+      "learning_rate": 9.501438453600807e-05,
+      "loss": 0.6267,
+      "step": 1326
+    },
+    {
+      "epoch": 0.5308,
+      "grad_norm": 0.35051466769057354,
+      "learning_rate": 9.488499965094664e-05,
+      "loss": 0.6817,
+      "step": 1327
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36063296938358125,
+      "learning_rate": 9.475562335052086e-05,
+      "loss": 0.6122,
+      "step": 1328
+    },
+    {
+      "epoch": 0.5316,
+      "grad_norm": 0.35092141745658173,
+      "learning_rate": 9.462625585186622e-05,
+      "loss": 0.6638,
+      "step": 1329
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.4218326001924773,
+      "learning_rate": 9.449689737210352e-05,
+      "loss": 0.5866,
+      "step": 1330
+    },
+    {
+      "epoch": 0.5324,
+      "grad_norm": 0.41197958155266357,
+      "learning_rate": 9.436754812833843e-05,
+      "loss": 0.6252,
+      "step": 1331
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3366671562252165,
+      "learning_rate": 9.423820833766108e-05,
+      "loss": 0.6093,
+      "step": 1332
+    },
+    {
+      "epoch": 0.5332,
+      "grad_norm": 0.34610849975686436,
+      "learning_rate": 9.410887821714571e-05,
+      "loss": 0.5994,
+      "step": 1333
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.34033293503836815,
+      "learning_rate": 9.39795579838504e-05,
+      "loss": 0.6112,
+      "step": 1334
+    },
+    {
+      "epoch": 0.534,
+      "grad_norm": 0.33741213458653624,
+      "learning_rate": 9.385024785481654e-05,
+      "loss": 0.6204,
+      "step": 1335
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3612458277810479,
+      "learning_rate": 9.372094804706867e-05,
+      "loss": 0.6882,
+      "step": 1336
+    },
+    {
+      "epoch": 0.5348,
+      "grad_norm": 0.33716664030270793,
+      "learning_rate": 9.359165877761397e-05,
+      "loss": 0.6369,
+      "step": 1337
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.34964309823147316,
+      "learning_rate": 9.346238026344186e-05,
+      "loss": 0.6314,
+      "step": 1338
+    },
+    {
+      "epoch": 0.5356,
+      "grad_norm": 0.34489818587790194,
+      "learning_rate": 9.333311272152386e-05,
+      "loss": 0.618,
+      "step": 1339
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3703409070384164,
+      "learning_rate": 9.320385636881283e-05,
+      "loss": 0.6189,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5364,
+      "grad_norm": 0.32808582341634207,
+      "learning_rate": 9.307461142224318e-05,
+      "loss": 0.6187,
+      "step": 1341
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.3446626780159281,
+      "learning_rate": 9.294537809872991e-05,
+      "loss": 0.5834,
+      "step": 1342
+    },
+    {
+      "epoch": 0.5372,
+      "grad_norm": 0.3350436294866912,
+      "learning_rate": 9.281615661516864e-05,
+      "loss": 0.5831,
+      "step": 1343
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.34148315360487524,
+      "learning_rate": 9.268694718843503e-05,
+      "loss": 0.5881,
+      "step": 1344
+    },
+    {
+      "epoch": 0.538,
+      "grad_norm": 0.3601618590645133,
+      "learning_rate": 9.255775003538462e-05,
+      "loss": 0.636,
+      "step": 1345
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.3393424797889516,
+      "learning_rate": 9.242856537285227e-05,
+      "loss": 0.6274,
+      "step": 1346
+    },
+    {
+      "epoch": 0.5388,
+      "grad_norm": 0.343149145366101,
+      "learning_rate": 9.229939341765188e-05,
+      "loss": 0.6221,
+      "step": 1347
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.35692843628237547,
+      "learning_rate": 9.217023438657605e-05,
+      "loss": 0.6269,
+      "step": 1348
+    },
+    {
+      "epoch": 0.5396,
+      "grad_norm": 0.333803007205758,
+      "learning_rate": 9.204108849639565e-05,
+      "loss": 0.6241,
+      "step": 1349
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3844682892008765,
+      "learning_rate": 9.19119559638596e-05,
+      "loss": 0.6593,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5404,
+      "grad_norm": 0.394809207078214,
+      "learning_rate": 9.178283700569424e-05,
+      "loss": 0.6066,
+      "step": 1351
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.34849520821286584,
+      "learning_rate": 9.165373183860328e-05,
+      "loss": 0.6417,
+      "step": 1352
+    },
+    {
+      "epoch": 0.5412,
+      "grad_norm": 0.328419664069886,
+      "learning_rate": 9.152464067926717e-05,
+      "loss": 0.6278,
+      "step": 1353
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.33761062084738297,
+      "learning_rate": 9.139556374434288e-05,
+      "loss": 0.588,
+      "step": 1354
+    },
+    {
+      "epoch": 0.542,
+      "grad_norm": 0.3342884235812433,
+      "learning_rate": 9.126650125046361e-05,
+      "loss": 0.5918,
+      "step": 1355
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.34043589106532285,
+      "learning_rate": 9.113745341423817e-05,
+      "loss": 0.6224,
+      "step": 1356
+    },
+    {
+      "epoch": 0.5428,
+      "grad_norm": 0.3333134613118826,
+      "learning_rate": 9.100842045225084e-05,
+      "loss": 0.631,
+      "step": 1357
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.36662979303788956,
+      "learning_rate": 9.087940258106093e-05,
+      "loss": 0.6044,
+      "step": 1358
+    },
+    {
+      "epoch": 0.5436,
+      "grad_norm": 0.350954421324191,
+      "learning_rate": 9.075040001720248e-05,
+      "loss": 0.581,
+      "step": 1359
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.33733312179983765,
+      "learning_rate": 9.062141297718371e-05,
+      "loss": 0.6292,
+      "step": 1360
+    },
+    {
+      "epoch": 0.5444,
+      "grad_norm": 0.3549294970748782,
+      "learning_rate": 9.049244167748694e-05,
+      "loss": 0.6374,
+      "step": 1361
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.3602806648834775,
+      "learning_rate": 9.036348633456792e-05,
+      "loss": 0.6205,
+      "step": 1362
+    },
+    {
+      "epoch": 0.5452,
+      "grad_norm": 0.35985574803936327,
+      "learning_rate": 9.02345471648557e-05,
+      "loss": 0.6452,
+      "step": 1363
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3793161645024034,
+      "learning_rate": 9.010562438475225e-05,
+      "loss": 0.6225,
+      "step": 1364
+    },
+    {
+      "epoch": 0.546,
+      "grad_norm": 0.3340735735140277,
+      "learning_rate": 8.997671821063191e-05,
+      "loss": 0.624,
+      "step": 1365
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.3350381095178896,
+      "learning_rate": 8.984782885884119e-05,
+      "loss": 0.5865,
+      "step": 1366
+    },
+    {
+      "epoch": 0.5468,
+      "grad_norm": 0.35258269054173075,
+      "learning_rate": 8.971895654569841e-05,
+      "loss": 0.6198,
+      "step": 1367
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.36295131136689934,
+      "learning_rate": 8.959010148749323e-05,
+      "loss": 0.6587,
+      "step": 1368
+    },
+    {
+      "epoch": 0.5476,
+      "grad_norm": 0.37638320613834875,
+      "learning_rate": 8.94612639004864e-05,
+      "loss": 0.6694,
+      "step": 1369
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.3515046750583341,
+      "learning_rate": 8.933244400090937e-05,
+      "loss": 0.6454,
+      "step": 1370
+    },
+    {
+      "epoch": 0.5484,
+      "grad_norm": 0.3813241742673669,
+      "learning_rate": 8.920364200496379e-05,
+      "loss": 0.6057,
+      "step": 1371
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.32850438706934315,
+      "learning_rate": 8.907485812882137e-05,
+      "loss": 0.6019,
+      "step": 1372
+    },
+    {
+      "epoch": 0.5492,
+      "grad_norm": 0.34498959496933074,
+      "learning_rate": 8.894609258862339e-05,
+      "loss": 0.627,
+      "step": 1373
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.35667574091889104,
+      "learning_rate": 8.881734560048036e-05,
+      "loss": 0.6409,
+      "step": 1374
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.35507706295378344,
+      "learning_rate": 8.868861738047158e-05,
+      "loss": 0.627,
+      "step": 1375
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3711015801034512,
+      "learning_rate": 8.855990814464496e-05,
+      "loss": 0.6233,
+      "step": 1376
+    },
+    {
+      "epoch": 0.5508,
+      "grad_norm": 0.35645935528359046,
+      "learning_rate": 8.843121810901642e-05,
+      "loss": 0.6554,
+      "step": 1377
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.3403366655222805,
+      "learning_rate": 8.830254748956982e-05,
+      "loss": 0.629,
+      "step": 1378
+    },
+    {
+      "epoch": 0.5516,
+      "grad_norm": 0.37629625156447527,
+      "learning_rate": 8.817389650225631e-05,
+      "loss": 0.6182,
+      "step": 1379
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.345875362567636,
+      "learning_rate": 8.804526536299413e-05,
+      "loss": 0.6055,
+      "step": 1380
+    },
+    {
+      "epoch": 0.5524,
+      "grad_norm": 0.33404949695921016,
+      "learning_rate": 8.791665428766818e-05,
+      "loss": 0.627,
+      "step": 1381
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.4068141191571966,
+      "learning_rate": 8.778806349212968e-05,
+      "loss": 0.6711,
+      "step": 1382
+    },
+    {
+      "epoch": 0.5532,
+      "grad_norm": 0.3374017182640437,
+      "learning_rate": 8.765949319219595e-05,
+      "loss": 0.6103,
+      "step": 1383
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.34207581665394704,
+      "learning_rate": 8.753094360364972e-05,
+      "loss": 0.65,
+      "step": 1384
+    },
+    {
+      "epoch": 0.554,
+      "grad_norm": 0.3267454950329156,
+      "learning_rate": 8.740241494223911e-05,
+      "loss": 0.5939,
+      "step": 1385
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.35810071664815823,
+      "learning_rate": 8.727390742367699e-05,
+      "loss": 0.6359,
+      "step": 1386
+    },
+    {
+      "epoch": 0.5548,
+      "grad_norm": 0.3551505732144722,
+      "learning_rate": 8.714542126364079e-05,
+      "loss": 0.6481,
+      "step": 1387
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.34768341140094017,
+      "learning_rate": 8.701695667777221e-05,
+      "loss": 0.6867,
+      "step": 1388
+    },
+    {
+      "epoch": 0.5556,
+      "grad_norm": 0.3395311643499959,
+      "learning_rate": 8.688851388167656e-05,
+      "loss": 0.6051,
+      "step": 1389
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.34737002301926034,
+      "learning_rate": 8.676009309092272e-05,
+      "loss": 0.5916,
+      "step": 1390
+    },
+    {
+      "epoch": 0.5564,
+      "grad_norm": 0.35852557052994755,
+      "learning_rate": 8.663169452104247e-05,
+      "loss": 0.6135,
+      "step": 1391
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.33085468263069967,
+      "learning_rate": 8.650331838753057e-05,
+      "loss": 0.5551,
+      "step": 1392
+    },
+    {
+      "epoch": 0.5572,
+      "grad_norm": 0.36288272915032965,
+      "learning_rate": 8.637496490584385e-05,
+      "loss": 0.648,
+      "step": 1393
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.3593536841658371,
+      "learning_rate": 8.624663429140128e-05,
+      "loss": 0.6849,
+      "step": 1394
+    },
+    {
+      "epoch": 0.558,
+      "grad_norm": 0.3788466375356503,
+      "learning_rate": 8.611832675958336e-05,
+      "loss": 0.6558,
+      "step": 1395
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3384717996689434,
+      "learning_rate": 8.59900425257319e-05,
+      "loss": 0.5839,
+      "step": 1396
+    },
+    {
+      "epoch": 0.5588,
+      "grad_norm": 0.3720588308067637,
+      "learning_rate": 8.586178180514968e-05,
+      "loss": 0.6398,
+      "step": 1397
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.3227102592124542,
+      "learning_rate": 8.573354481309985e-05,
+      "loss": 0.5731,
+      "step": 1398
+    },
+    {
+      "epoch": 0.5596,
+      "grad_norm": 0.36349581862674507,
+      "learning_rate": 8.560533176480587e-05,
+      "loss": 0.6128,
+      "step": 1399
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.393821757193281,
+      "learning_rate": 8.5477142875451e-05,
+      "loss": 0.6547,
+      "step": 1400
+    },
+    {
+      "epoch": 0.5604,
+      "grad_norm": 0.36439378783743004,
+      "learning_rate": 8.534897836017784e-05,
+      "loss": 0.6512,
+      "step": 1401
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.3411692652437216,
+      "learning_rate": 8.522083843408823e-05,
+      "loss": 0.5871,
+      "step": 1402
+    },
+    {
+      "epoch": 0.5612,
+      "grad_norm": 0.33776851446598916,
+      "learning_rate": 8.509272331224269e-05,
+      "loss": 0.5583,
+      "step": 1403
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3590738807699268,
+      "learning_rate": 8.496463320966005e-05,
+      "loss": 0.6391,
+      "step": 1404
+    },
+    {
+      "epoch": 0.562,
+      "grad_norm": 0.365703700029338,
+      "learning_rate": 8.48365683413172e-05,
+      "loss": 0.6617,
+      "step": 1405
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.3526122930372505,
+      "learning_rate": 8.470852892214874e-05,
+      "loss": 0.6446,
+      "step": 1406
+    },
+    {
+      "epoch": 0.5628,
+      "grad_norm": 0.36900049711306404,
+      "learning_rate": 8.458051516704644e-05,
+      "loss": 0.6279,
+      "step": 1407
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.36941146433944927,
+      "learning_rate": 8.445252729085906e-05,
+      "loss": 0.6276,
+      "step": 1408
+    },
+    {
+      "epoch": 0.5636,
+      "grad_norm": 0.3742118804570097,
+      "learning_rate": 8.432456550839195e-05,
+      "loss": 0.6203,
+      "step": 1409
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.34650767561250045,
+      "learning_rate": 8.419663003440657e-05,
+      "loss": 0.6507,
+      "step": 1410
+    },
+    {
+      "epoch": 0.5644,
+      "grad_norm": 0.43898204956051495,
+      "learning_rate": 8.406872108362034e-05,
+      "loss": 0.6228,
+      "step": 1411
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3607453694697088,
+      "learning_rate": 8.394083887070613e-05,
+      "loss": 0.6446,
+      "step": 1412
+    },
+    {
+      "epoch": 0.5652,
+      "grad_norm": 0.34182866489981933,
+      "learning_rate": 8.381298361029189e-05,
+      "loss": 0.6226,
+      "step": 1413
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.3742610368049245,
+      "learning_rate": 8.36851555169604e-05,
+      "loss": 0.6336,
+      "step": 1414
+    },
+    {
+      "epoch": 0.566,
+      "grad_norm": 0.3454849337082054,
+      "learning_rate": 8.355735480524874e-05,
+      "loss": 0.6192,
+      "step": 1415
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3502224039208517,
+      "learning_rate": 8.342958168964817e-05,
+      "loss": 0.6369,
+      "step": 1416
+    },
+    {
+      "epoch": 0.5668,
+      "grad_norm": 0.3718134030541887,
+      "learning_rate": 8.330183638460356e-05,
+      "loss": 0.6492,
+      "step": 1417
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.3624606680464668,
+      "learning_rate": 8.317411910451313e-05,
+      "loss": 0.6141,
+      "step": 1418
+    },
+    {
+      "epoch": 0.5676,
+      "grad_norm": 0.34742545895317145,
+      "learning_rate": 8.304643006372797e-05,
+      "loss": 0.6098,
+      "step": 1419
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.40057662795640137,
+      "learning_rate": 8.291876947655196e-05,
+      "loss": 0.6366,
+      "step": 1420
+    },
+    {
+      "epoch": 0.5684,
+      "grad_norm": 0.345727686669028,
+      "learning_rate": 8.279113755724111e-05,
+      "loss": 0.6614,
+      "step": 1421
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.3328107042605165,
+      "learning_rate": 8.266353452000326e-05,
+      "loss": 0.624,
+      "step": 1422
+    },
+    {
+      "epoch": 0.5692,
+      "grad_norm": 0.35151415097284505,
+      "learning_rate": 8.253596057899789e-05,
+      "loss": 0.5577,
+      "step": 1423
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.40594804322909833,
+      "learning_rate": 8.240841594833554e-05,
+      "loss": 0.5966,
+      "step": 1424
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.35567838303548605,
+      "learning_rate": 8.228090084207774e-05,
+      "loss": 0.6188,
+      "step": 1425
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.33426743422578514,
+      "learning_rate": 8.215341547423624e-05,
+      "loss": 0.573,
+      "step": 1426
+    },
+    {
+      "epoch": 0.5708,
+      "grad_norm": 0.3600479894510643,
+      "learning_rate": 8.202596005877306e-05,
+      "loss": 0.6321,
+      "step": 1427
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3486290301517772,
+      "learning_rate": 8.189853480959981e-05,
+      "loss": 0.6372,
+      "step": 1428
+    },
+    {
+      "epoch": 0.5716,
+      "grad_norm": 0.34223847404772584,
+      "learning_rate": 8.177113994057755e-05,
+      "loss": 0.6057,
+      "step": 1429
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.35597344622619104,
+      "learning_rate": 8.16437756655164e-05,
+      "loss": 0.6159,
+      "step": 1430
+    },
+    {
+      "epoch": 0.5724,
+      "grad_norm": 0.3582317597717354,
+      "learning_rate": 8.1516442198175e-05,
+      "loss": 0.6325,
+      "step": 1431
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.39871033733728933,
+      "learning_rate": 8.138913975226044e-05,
+      "loss": 0.6031,
+      "step": 1432
+    },
+    {
+      "epoch": 0.5732,
+      "grad_norm": 0.3529099409918213,
+      "learning_rate": 8.126186854142752e-05,
+      "loss": 0.6357,
+      "step": 1433
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.3398109519070245,
+      "learning_rate": 8.113462877927893e-05,
+      "loss": 0.6173,
+      "step": 1434
+    },
+    {
+      "epoch": 0.574,
+      "grad_norm": 0.3559734332419003,
+      "learning_rate": 8.100742067936431e-05,
+      "loss": 0.6419,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3327454815046734,
+      "learning_rate": 8.088024445518033e-05,
+      "loss": 0.626,
+      "step": 1436
+    },
+    {
+      "epoch": 0.5748,
+      "grad_norm": 0.3723573253181277,
+      "learning_rate": 8.075310032017e-05,
+      "loss": 0.6793,
+      "step": 1437
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.3626763820021229,
+      "learning_rate": 8.06259884877226e-05,
+      "loss": 0.6869,
+      "step": 1438
+    },
+    {
+      "epoch": 0.5756,
+      "grad_norm": 0.3369433680940479,
+      "learning_rate": 8.049890917117322e-05,
+      "loss": 0.587,
+      "step": 1439
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3620611041529325,
+      "learning_rate": 8.037186258380226e-05,
+      "loss": 0.5919,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5764,
+      "grad_norm": 0.3307415356263632,
+      "learning_rate": 8.024484893883529e-05,
+      "loss": 0.6069,
+      "step": 1441
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.3703035450620265,
+      "learning_rate": 8.01178684494425e-05,
+      "loss": 0.6679,
+      "step": 1442
+    },
+    {
+      "epoch": 0.5772,
+      "grad_norm": 0.34436065944666616,
+      "learning_rate": 7.99909213287385e-05,
+      "loss": 0.6307,
+      "step": 1443
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3634717918826862,
+      "learning_rate": 7.986400778978193e-05,
+      "loss": 0.6651,
+      "step": 1444
+    },
+    {
+      "epoch": 0.578,
+      "grad_norm": 0.3670417232457763,
+      "learning_rate": 7.973712804557501e-05,
+      "loss": 0.607,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.355642465027834,
+      "learning_rate": 7.96102823090632e-05,
+      "loss": 0.6181,
+      "step": 1446
+    },
+    {
+      "epoch": 0.5788,
+      "grad_norm": 0.3516485913587409,
+      "learning_rate": 7.948347079313494e-05,
+      "loss": 0.6482,
+      "step": 1447
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3447910881142975,
+      "learning_rate": 7.935669371062133e-05,
+      "loss": 0.5983,
+      "step": 1448
+    },
+    {
+      "epoch": 0.5796,
+      "grad_norm": 0.3402079050458831,
+      "learning_rate": 7.922995127429548e-05,
+      "loss": 0.6111,
+      "step": 1449
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.36801272918948597,
+      "learning_rate": 7.91032436968725e-05,
+      "loss": 0.6572,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5804,
+      "grad_norm": 0.33889533218091145,
+      "learning_rate": 7.897657119100896e-05,
+      "loss": 0.6177,
+      "step": 1451
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.3644963953865768,
+      "learning_rate": 7.88499339693025e-05,
+      "loss": 0.5866,
+      "step": 1452
+    },
+    {
+      "epoch": 0.5812,
+      "grad_norm": 0.3588575229144209,
+      "learning_rate": 7.872333224429167e-05,
+      "loss": 0.6166,
+      "step": 1453
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.3372220593947156,
+      "learning_rate": 7.859676622845535e-05,
+      "loss": 0.5507,
+      "step": 1454
+    },
+    {
+      "epoch": 0.582,
+      "grad_norm": 0.3294336764623329,
+      "learning_rate": 7.847023613421251e-05,
+      "loss": 0.5814,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3621068399864894,
+      "learning_rate": 7.834374217392188e-05,
+      "loss": 0.5946,
+      "step": 1456
+    },
+    {
+      "epoch": 0.5828,
+      "grad_norm": 0.33904287966850216,
+      "learning_rate": 7.82172845598814e-05,
+      "loss": 0.6063,
+      "step": 1457
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.36175090834503343,
+      "learning_rate": 7.809086350432819e-05,
+      "loss": 0.6005,
+      "step": 1458
+    },
+    {
+      "epoch": 0.5836,
+      "grad_norm": 0.3571985631122103,
+      "learning_rate": 7.796447921943792e-05,
+      "loss": 0.6325,
+      "step": 1459
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3955170703935457,
+      "learning_rate": 7.78381319173246e-05,
+      "loss": 0.645,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5844,
+      "grad_norm": 0.33034514246320046,
+      "learning_rate": 7.771182181004005e-05,
+      "loss": 0.6078,
+      "step": 1461
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.374720598768528,
+      "learning_rate": 7.758554910957378e-05,
+      "loss": 0.6717,
+      "step": 1462
+    },
+    {
+      "epoch": 0.5852,
+      "grad_norm": 0.3536240940306809,
+      "learning_rate": 7.745931402785251e-05,
+      "loss": 0.5734,
+      "step": 1463
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3632092428720265,
+      "learning_rate": 7.73331167767398e-05,
+      "loss": 0.6438,
+      "step": 1464
+    },
+    {
+      "epoch": 0.586,
+      "grad_norm": 0.38339157480175035,
+      "learning_rate": 7.72069575680357e-05,
+      "loss": 0.6561,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.34138554844386654,
+      "learning_rate": 7.708083661347637e-05,
+      "loss": 0.6167,
+      "step": 1466
+    },
+    {
+      "epoch": 0.5868,
+      "grad_norm": 0.35147349734926064,
+      "learning_rate": 7.695475412473391e-05,
+      "loss": 0.6651,
+      "step": 1467
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.32850754726847736,
+      "learning_rate": 7.682871031341578e-05,
+      "loss": 0.5925,
+      "step": 1468
+    },
+    {
+      "epoch": 0.5876,
+      "grad_norm": 0.3566837414245581,
+      "learning_rate": 7.670270539106451e-05,
+      "loss": 0.6453,
+      "step": 1469
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.3327063941527943,
+      "learning_rate": 7.657673956915735e-05,
+      "loss": 0.6383,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5884,
+      "grad_norm": 0.327214303925731,
+      "learning_rate": 7.645081305910595e-05,
+      "loss": 0.6093,
+      "step": 1471
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3759942478506926,
+      "learning_rate": 7.632492607225604e-05,
+      "loss": 0.6267,
+      "step": 1472
+    },
+    {
+      "epoch": 0.5892,
+      "grad_norm": 0.335506912856553,
+      "learning_rate": 7.619907881988692e-05,
+      "loss": 0.5978,
+      "step": 1473
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.33225509143100557,
+      "learning_rate": 7.607327151321126e-05,
+      "loss": 0.6012,
+      "step": 1474
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.3301363288014174,
+      "learning_rate": 7.594750436337467e-05,
+      "loss": 0.5967,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3520220573224516,
+      "learning_rate": 7.582177758145532e-05,
+      "loss": 0.6253,
+      "step": 1476
+    },
+    {
+      "epoch": 0.5908,
+      "grad_norm": 0.35776203174188537,
+      "learning_rate": 7.569609137846376e-05,
+      "loss": 0.588,
+      "step": 1477
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.32838797616857823,
+      "learning_rate": 7.557044596534234e-05,
+      "loss": 0.6202,
+      "step": 1478
+    },
+    {
+      "epoch": 0.5916,
+      "grad_norm": 0.32378600448202055,
+      "learning_rate": 7.544484155296492e-05,
+      "loss": 0.6383,
+      "step": 1479
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.34562514537934313,
+      "learning_rate": 7.531927835213656e-05,
+      "loss": 0.6047,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5924,
+      "grad_norm": 0.3619700721042768,
+      "learning_rate": 7.519375657359331e-05,
+      "loss": 0.6079,
+      "step": 1481
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.3553866992644801,
+      "learning_rate": 7.506827642800145e-05,
+      "loss": 0.6085,
+      "step": 1482
+    },
+    {
+      "epoch": 0.5932,
+      "grad_norm": 0.3568363066675366,
+      "learning_rate": 7.494283812595761e-05,
+      "loss": 0.6437,
+      "step": 1483
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3328699680755155,
+      "learning_rate": 7.4817441877988e-05,
+      "loss": 0.5958,
+      "step": 1484
+    },
+    {
+      "epoch": 0.594,
+      "grad_norm": 0.3476380867250527,
+      "learning_rate": 7.469208789454838e-05,
+      "loss": 0.6237,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.39764315061535754,
+      "learning_rate": 7.456677638602355e-05,
+      "loss": 0.6256,
+      "step": 1486
+    },
+    {
+      "epoch": 0.5948,
+      "grad_norm": 0.3175764987422378,
+      "learning_rate": 7.444150756272704e-05,
+      "loss": 0.5968,
+      "step": 1487
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3471809013149163,
+      "learning_rate": 7.431628163490066e-05,
+      "loss": 0.6534,
+      "step": 1488
+    },
+    {
+      "epoch": 0.5956,
+      "grad_norm": 0.33912306226399297,
+      "learning_rate": 7.419109881271433e-05,
+      "loss": 0.6432,
+      "step": 1489
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.3671412851365395,
+      "learning_rate": 7.40659593062655e-05,
+      "loss": 0.6833,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5964,
+      "grad_norm": 0.34018469479626184,
+      "learning_rate": 7.394086332557906e-05,
+      "loss": 0.6123,
+      "step": 1491
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.3143271250669063,
+      "learning_rate": 7.38158110806068e-05,
+      "loss": 0.5757,
+      "step": 1492
+    },
+    {
+      "epoch": 0.5972,
+      "grad_norm": 0.3747457614576391,
+      "learning_rate": 7.369080278122705e-05,
+      "loss": 0.6242,
+      "step": 1493
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.3299584007994645,
+      "learning_rate": 7.356583863724442e-05,
+      "loss": 0.6144,
+      "step": 1494
+    },
+    {
+      "epoch": 0.598,
+      "grad_norm": 0.3413173438447968,
+      "learning_rate": 7.344091885838948e-05,
+      "loss": 0.5983,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.35731476648357896,
+      "learning_rate": 7.331604365431825e-05,
+      "loss": 0.6357,
+      "step": 1496
+    },
+    {
+      "epoch": 0.5988,
+      "grad_norm": 0.36985199008247144,
+      "learning_rate": 7.319121323461197e-05,
+      "loss": 0.6363,
+      "step": 1497
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.36139746941975087,
+      "learning_rate": 7.306642780877675e-05,
+      "loss": 0.6435,
+      "step": 1498
+    },
+    {
+      "epoch": 0.5996,
+      "grad_norm": 0.3681686163413637,
+      "learning_rate": 7.294168758624307e-05,
+      "loss": 0.6949,
+      "step": 1499
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3773067706293726,
+      "learning_rate": 7.281699277636572e-05,
+      "loss": 0.6184,
+      "step": 1500
+    },
+    {
+      "epoch": 0.6004,
+      "grad_norm": 0.3393541897664475,
+      "learning_rate": 7.269234358842314e-05,
+      "loss": 0.5896,
+      "step": 1501
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.32358105086572475,
+      "learning_rate": 7.256774023161728e-05,
+      "loss": 0.5677,
+      "step": 1502
+    },
+    {
+      "epoch": 0.6012,
+      "grad_norm": 0.3342513260527789,
+      "learning_rate": 7.244318291507309e-05,
+      "loss": 0.6213,
+      "step": 1503
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.31814745140145045,
+      "learning_rate": 7.231867184783826e-05,
+      "loss": 0.5922,
+      "step": 1504
+    },
+    {
+      "epoch": 0.602,
+      "grad_norm": 0.34784167366745294,
+      "learning_rate": 7.2194207238883e-05,
+      "loss": 0.63,
+      "step": 1505
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.36043986672576783,
+      "learning_rate": 7.206978929709935e-05,
+      "loss": 0.624,
+      "step": 1506
+    },
+    {
+      "epoch": 0.6028,
+      "grad_norm": 0.3711628694852042,
+      "learning_rate": 7.194541823130118e-05,
+      "loss": 0.5885,
+      "step": 1507
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.33722221627158105,
+      "learning_rate": 7.182109425022357e-05,
+      "loss": 0.5979,
+      "step": 1508
+    },
+    {
+      "epoch": 0.6036,
+      "grad_norm": 0.34251133433916997,
+      "learning_rate": 7.169681756252264e-05,
+      "loss": 0.5948,
+      "step": 1509
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.3295678289072513,
+      "learning_rate": 7.157258837677514e-05,
+      "loss": 0.5835,
+      "step": 1510
+    },
+    {
+      "epoch": 0.6044,
+      "grad_norm": 0.31868964772249,
+      "learning_rate": 7.144840690147811e-05,
+      "loss": 0.6026,
+      "step": 1511
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3407551877917364,
+      "learning_rate": 7.132427334504846e-05,
+      "loss": 0.5956,
+      "step": 1512
+    },
+    {
+      "epoch": 0.6052,
+      "grad_norm": 0.36301195896097443,
+      "learning_rate": 7.120018791582266e-05,
+      "loss": 0.6283,
+      "step": 1513
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.35832312783288844,
+      "learning_rate": 7.107615082205654e-05,
+      "loss": 0.6494,
+      "step": 1514
+    },
+    {
+      "epoch": 0.606,
+      "grad_norm": 0.3277676821441542,
+      "learning_rate": 7.095216227192467e-05,
+      "loss": 0.5795,
+      "step": 1515
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.34409138634054304,
+      "learning_rate": 7.082822247352023e-05,
+      "loss": 0.6172,
+      "step": 1516
+    },
+    {
+      "epoch": 0.6068,
+      "grad_norm": 0.33163630536725375,
+      "learning_rate": 7.07043316348545e-05,
+      "loss": 0.6148,
+      "step": 1517
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.3807080731525262,
+      "learning_rate": 7.058048996385664e-05,
+      "loss": 0.6268,
+      "step": 1518
+    },
+    {
+      "epoch": 0.6076,
+      "grad_norm": 0.3302425537167297,
+      "learning_rate": 7.045669766837333e-05,
+      "loss": 0.6181,
+      "step": 1519
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3416511838952011,
+      "learning_rate": 7.033295495616834e-05,
+      "loss": 0.6031,
+      "step": 1520
+    },
+    {
+      "epoch": 0.6084,
+      "grad_norm": 0.3708701738547392,
+      "learning_rate": 7.020926203492218e-05,
+      "loss": 0.6189,
+      "step": 1521
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.3337542319880197,
+      "learning_rate": 7.008561911223186e-05,
+      "loss": 0.6023,
+      "step": 1522
+    },
+    {
+      "epoch": 0.6092,
+      "grad_norm": 0.3685066491511751,
+      "learning_rate": 6.996202639561041e-05,
+      "loss": 0.6607,
+      "step": 1523
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3530816436474091,
+      "learning_rate": 6.983848409248671e-05,
+      "loss": 0.6099,
+      "step": 1524
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.33140625458221923,
+      "learning_rate": 6.971499241020495e-05,
+      "loss": 0.6135,
+      "step": 1525
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.33283277876250533,
+      "learning_rate": 6.959155155602433e-05,
+      "loss": 0.5767,
+      "step": 1526
+    },
+    {
+      "epoch": 0.6108,
+      "grad_norm": 0.36481283269146,
+      "learning_rate": 6.946816173711878e-05,
+      "loss": 0.6464,
+      "step": 1527
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.34026942238835184,
+      "learning_rate": 6.934482316057663e-05,
+      "loss": 0.6132,
+      "step": 1528
+    },
+    {
+      "epoch": 0.6116,
+      "grad_norm": 0.3378818728280427,
+      "learning_rate": 6.922153603340016e-05,
+      "loss": 0.5984,
+      "step": 1529
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.34921102193272463,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.5649,
+      "step": 1530
+    },
+    {
+      "epoch": 0.6124,
+      "grad_norm": 0.3572602699436952,
+      "learning_rate": 6.897511695472123e-05,
+      "loss": 0.599,
+      "step": 1531
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.3487090468932393,
+      "learning_rate": 6.885198541679015e-05,
+      "loss": 0.586,
+      "step": 1532
+    },
+    {
+      "epoch": 0.6132,
+      "grad_norm": 0.33667924874566424,
+      "learning_rate": 6.872890615536694e-05,
+      "loss": 0.6496,
+      "step": 1533
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.33375982398075066,
+      "learning_rate": 6.860587937701862e-05,
+      "loss": 0.5801,
+      "step": 1534
+    },
+    {
+      "epoch": 0.614,
+      "grad_norm": 0.33669078924570117,
+      "learning_rate": 6.848290528822416e-05,
+      "loss": 0.6009,
+      "step": 1535
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.34801084739291366,
+      "learning_rate": 6.835998409537412e-05,
+      "loss": 0.601,
+      "step": 1536
+    },
+    {
+      "epoch": 0.6148,
+      "grad_norm": 0.35856953999733276,
+      "learning_rate": 6.823711600477025e-05,
+      "loss": 0.6364,
+      "step": 1537
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.4207855839311045,
+      "learning_rate": 6.811430122262529e-05,
+      "loss": 0.6066,
+      "step": 1538
+    },
+    {
+      "epoch": 0.6156,
+      "grad_norm": 0.34510379391800705,
+      "learning_rate": 6.799153995506233e-05,
+      "loss": 0.5881,
+      "step": 1539
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3483349424676601,
+      "learning_rate": 6.786883240811479e-05,
+      "loss": 0.6192,
+      "step": 1540
+    },
+    {
+      "epoch": 0.6164,
+      "grad_norm": 0.3426317429269225,
+      "learning_rate": 6.774617878772582e-05,
+      "loss": 0.5891,
+      "step": 1541
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.3570391863388374,
+      "learning_rate": 6.76235792997482e-05,
+      "loss": 0.6508,
+      "step": 1542
+    },
+    {
+      "epoch": 0.6172,
+      "grad_norm": 0.33436810655168264,
+      "learning_rate": 6.750103414994374e-05,
+      "loss": 0.6168,
+      "step": 1543
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.33635108547988973,
+      "learning_rate": 6.737854354398307e-05,
+      "loss": 0.6019,
+      "step": 1544
+    },
+    {
+      "epoch": 0.618,
+      "grad_norm": 0.4032601744512013,
+      "learning_rate": 6.725610768744534e-05,
+      "loss": 0.6302,
+      "step": 1545
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.3435958627550203,
+      "learning_rate": 6.713372678581774e-05,
+      "loss": 0.6156,
+      "step": 1546
+    },
+    {
+      "epoch": 0.6188,
+      "grad_norm": 0.34363295200916266,
+      "learning_rate": 6.70114010444953e-05,
+      "loss": 0.6079,
+      "step": 1547
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.34363602325097997,
+      "learning_rate": 6.688913066878039e-05,
+      "loss": 0.6334,
+      "step": 1548
+    },
+    {
+      "epoch": 0.6196,
+      "grad_norm": 0.32753115254913256,
+      "learning_rate": 6.676691586388255e-05,
+      "loss": 0.6111,
+      "step": 1549
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.35690818367626614,
+      "learning_rate": 6.664475683491796e-05,
+      "loss": 0.62,
+      "step": 1550
+    },
+    {
+      "epoch": 0.6204,
+      "grad_norm": 0.34196251229699387,
+      "learning_rate": 6.652265378690922e-05,
+      "loss": 0.6442,
+      "step": 1551
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.33911356642323115,
+      "learning_rate": 6.640060692478509e-05,
+      "loss": 0.5776,
+      "step": 1552
+    },
+    {
+      "epoch": 0.6212,
+      "grad_norm": 0.3346480835046118,
+      "learning_rate": 6.627861645337984e-05,
+      "loss": 0.6016,
+      "step": 1553
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.347171889573341,
+      "learning_rate": 6.615668257743321e-05,
+      "loss": 0.6187,
+      "step": 1554
+    },
+    {
+      "epoch": 0.622,
+      "grad_norm": 0.3304898760604142,
+      "learning_rate": 6.603480550158995e-05,
+      "loss": 0.5886,
+      "step": 1555
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3668158254051957,
+      "learning_rate": 6.591298543039949e-05,
+      "loss": 0.6175,
+      "step": 1556
+    },
+    {
+      "epoch": 0.6228,
+      "grad_norm": 0.36068648999637415,
+      "learning_rate": 6.579122256831551e-05,
+      "loss": 0.578,
+      "step": 1557
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.324636999369861,
+      "learning_rate": 6.56695171196958e-05,
+      "loss": 0.5967,
+      "step": 1558
+    },
+    {
+      "epoch": 0.6236,
+      "grad_norm": 0.3362041332820818,
+      "learning_rate": 6.554786928880164e-05,
+      "loss": 0.6431,
+      "step": 1559
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3182739366074914,
+      "learning_rate": 6.542627927979771e-05,
+      "loss": 0.5741,
+      "step": 1560
+    },
+    {
+      "epoch": 0.6244,
+      "grad_norm": 0.3301255157545653,
+      "learning_rate": 6.530474729675167e-05,
+      "loss": 0.6307,
+      "step": 1561
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.3159628923173484,
+      "learning_rate": 6.518327354363374e-05,
+      "loss": 0.6092,
+      "step": 1562
+    },
+    {
+      "epoch": 0.6252,
+      "grad_norm": 0.3219719456819235,
+      "learning_rate": 6.506185822431638e-05,
+      "loss": 0.6225,
+      "step": 1563
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.31618004869884997,
+      "learning_rate": 6.494050154257407e-05,
+      "loss": 0.601,
+      "step": 1564
+    },
+    {
+      "epoch": 0.626,
+      "grad_norm": 0.34916825527224976,
+      "learning_rate": 6.481920370208274e-05,
+      "loss": 0.5667,
+      "step": 1565
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.3492294364000783,
+      "learning_rate": 6.469796490641973e-05,
+      "loss": 0.6265,
+      "step": 1566
+    },
+    {
+      "epoch": 0.6268,
+      "grad_norm": 0.33940934685888674,
+      "learning_rate": 6.457678535906322e-05,
+      "loss": 0.548,
+      "step": 1567
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3339632201390132,
+      "learning_rate": 6.445566526339188e-05,
+      "loss": 0.576,
+      "step": 1568
+    },
+    {
+      "epoch": 0.6276,
+      "grad_norm": 0.35130711286775096,
+      "learning_rate": 6.433460482268464e-05,
+      "loss": 0.621,
+      "step": 1569
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.36342412613863156,
+      "learning_rate": 6.42136042401204e-05,
+      "loss": 0.6298,
+      "step": 1570
+    },
+    {
+      "epoch": 0.6284,
+      "grad_norm": 0.3365696836642408,
+      "learning_rate": 6.409266371877751e-05,
+      "loss": 0.605,
+      "step": 1571
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.35262966344092284,
+      "learning_rate": 6.397178346163349e-05,
+      "loss": 0.6144,
+      "step": 1572
+    },
+    {
+      "epoch": 0.6292,
+      "grad_norm": 0.3479282156154242,
+      "learning_rate": 6.38509636715648e-05,
+      "loss": 0.6308,
+      "step": 1573
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.35797898629366776,
+      "learning_rate": 6.373020455134634e-05,
+      "loss": 0.6108,
+      "step": 1574
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.3581346498101776,
+      "learning_rate": 6.360950630365126e-05,
+      "loss": 0.6253,
+      "step": 1575
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3415194837946404,
+      "learning_rate": 6.34888691310505e-05,
+      "loss": 0.5685,
+      "step": 1576
+    },
+    {
+      "epoch": 0.6308,
+      "grad_norm": 0.3476884710311523,
+      "learning_rate": 6.33682932360125e-05,
+      "loss": 0.5958,
+      "step": 1577
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.3535544533307336,
+      "learning_rate": 6.324777882090287e-05,
+      "loss": 0.6489,
+      "step": 1578
+    },
+    {
+      "epoch": 0.6316,
+      "grad_norm": 0.3247012241756796,
+      "learning_rate": 6.312732608798397e-05,
+      "loss": 0.5907,
+      "step": 1579
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.35701198841586984,
+      "learning_rate": 6.300693523941482e-05,
+      "loss": 0.6268,
+      "step": 1580
+    },
+    {
+      "epoch": 0.6324,
+      "grad_norm": 0.3530680640497662,
+      "learning_rate": 6.288660647725034e-05,
+      "loss": 0.6148,
+      "step": 1581
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.32991564455360417,
+      "learning_rate": 6.276634000344143e-05,
+      "loss": 0.5772,
+      "step": 1582
+    },
+    {
+      "epoch": 0.6332,
+      "grad_norm": 0.3395810927203878,
+      "learning_rate": 6.264613601983435e-05,
+      "loss": 0.6066,
+      "step": 1583
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3373112099227295,
+      "learning_rate": 6.25259947281705e-05,
+      "loss": 0.5861,
+      "step": 1584
+    },
+    {
+      "epoch": 0.634,
+      "grad_norm": 0.334228708671868,
+      "learning_rate": 6.24059163300861e-05,
+      "loss": 0.5617,
+      "step": 1585
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.345907115889204,
+      "learning_rate": 6.22859010271118e-05,
+      "loss": 0.6248,
+      "step": 1586
+    },
+    {
+      "epoch": 0.6348,
+      "grad_norm": 0.3779567983486365,
+      "learning_rate": 6.216594902067232e-05,
+      "loss": 0.6303,
+      "step": 1587
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3583018978395348,
+      "learning_rate": 6.204606051208617e-05,
+      "loss": 0.6137,
+      "step": 1588
+    },
+    {
+      "epoch": 0.6356,
+      "grad_norm": 0.3433342008135954,
+      "learning_rate": 6.192623570256535e-05,
+      "loss": 0.6089,
+      "step": 1589
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.3353786937623075,
+      "learning_rate": 6.180647479321485e-05,
+      "loss": 0.6303,
+      "step": 1590
+    },
+    {
+      "epoch": 0.6364,
+      "grad_norm": 0.34009862544113934,
+      "learning_rate": 6.168677798503247e-05,
+      "loss": 0.6307,
+      "step": 1591
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3736520602666011,
+      "learning_rate": 6.156714547890838e-05,
+      "loss": 0.5946,
+      "step": 1592
+    },
+    {
+      "epoch": 0.6372,
+      "grad_norm": 0.3435425647296458,
+      "learning_rate": 6.144757747562489e-05,
+      "loss": 0.6302,
+      "step": 1593
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.32727000904697967,
+      "learning_rate": 6.13280741758561e-05,
+      "loss": 0.6127,
+      "step": 1594
+    },
+    {
+      "epoch": 0.638,
+      "grad_norm": 0.36753270428283846,
+      "learning_rate": 6.120863578016735e-05,
+      "loss": 0.632,
+      "step": 1595
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.32514548505631924,
+      "learning_rate": 6.108926248901521e-05,
+      "loss": 0.5782,
+      "step": 1596
+    },
+    {
+      "epoch": 0.6388,
+      "grad_norm": 0.3207337446453717,
+      "learning_rate": 6.096995450274692e-05,
+      "loss": 0.5836,
+      "step": 1597
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.358030142284751,
+      "learning_rate": 6.085071202160004e-05,
+      "loss": 0.6495,
+      "step": 1598
+    },
+    {
+      "epoch": 0.6396,
+      "grad_norm": 0.3508927594214803,
+      "learning_rate": 6.0731535245702366e-05,
+      "loss": 0.5946,
+      "step": 1599
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3386012278500723,
+      "learning_rate": 6.061242437507131e-05,
+      "loss": 0.6233,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6404,
+      "grad_norm": 0.354317079178696,
+      "learning_rate": 6.049337960961362e-05,
+      "loss": 0.5686,
+      "step": 1601
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.3457713745654834,
+      "learning_rate": 6.0374401149125204e-05,
+      "loss": 0.6255,
+      "step": 1602
+    },
+    {
+      "epoch": 0.6412,
+      "grad_norm": 0.3433001073124816,
+      "learning_rate": 6.025548919329067e-05,
+      "loss": 0.629,
+      "step": 1603
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3423525871043034,
+      "learning_rate": 6.013664394168297e-05,
+      "loss": 0.5807,
+      "step": 1604
+    },
+    {
+      "epoch": 0.642,
+      "grad_norm": 0.3474180322514183,
+      "learning_rate": 6.00178655937631e-05,
+      "loss": 0.6273,
+      "step": 1605
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.3614314983708964,
+      "learning_rate": 5.989915434887985e-05,
+      "loss": 0.5821,
+      "step": 1606
+    },
+    {
+      "epoch": 0.6428,
+      "grad_norm": 0.3478797951136011,
+      "learning_rate": 5.978051040626924e-05,
+      "loss": 0.5748,
+      "step": 1607
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.356975941297783,
+      "learning_rate": 5.9661933965054516e-05,
+      "loss": 0.5878,
+      "step": 1608
+    },
+    {
+      "epoch": 0.6436,
+      "grad_norm": 0.3435682743215697,
+      "learning_rate": 5.9543425224245534e-05,
+      "loss": 0.5934,
+      "step": 1609
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.3425447440862132,
+      "learning_rate": 5.942498438273849e-05,
+      "loss": 0.6592,
+      "step": 1610
+    },
+    {
+      "epoch": 0.6444,
+      "grad_norm": 0.32859194465404135,
+      "learning_rate": 5.9306611639315724e-05,
+      "loss": 0.6402,
+      "step": 1611
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.32171709661226766,
+      "learning_rate": 5.9188307192645145e-05,
+      "loss": 0.5754,
+      "step": 1612
+    },
+    {
+      "epoch": 0.6452,
+      "grad_norm": 0.3664751697653012,
+      "learning_rate": 5.907007124128023e-05,
+      "loss": 0.6061,
+      "step": 1613
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.3561084222916176,
+      "learning_rate": 5.895190398365935e-05,
+      "loss": 0.6439,
+      "step": 1614
+    },
+    {
+      "epoch": 0.646,
+      "grad_norm": 0.35906093816510193,
+      "learning_rate": 5.883380561810563e-05,
+      "loss": 0.6493,
+      "step": 1615
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3791646125351826,
+      "learning_rate": 5.871577634282654e-05,
+      "loss": 0.6317,
+      "step": 1616
+    },
+    {
+      "epoch": 0.6468,
+      "grad_norm": 0.3407556331076295,
+      "learning_rate": 5.8597816355913684e-05,
+      "loss": 0.5503,
+      "step": 1617
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.3465952031897078,
+      "learning_rate": 5.84799258553423e-05,
+      "loss": 0.6688,
+      "step": 1618
+    },
+    {
+      "epoch": 0.6476,
+      "grad_norm": 0.3397174114357965,
+      "learning_rate": 5.836210503897099e-05,
+      "loss": 0.6059,
+      "step": 1619
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.34348908309352155,
+      "learning_rate": 5.82443541045415e-05,
+      "loss": 0.6413,
+      "step": 1620
+    },
+    {
+      "epoch": 0.6484,
+      "grad_norm": 0.3644566121091593,
+      "learning_rate": 5.812667324967813e-05,
+      "loss": 0.6031,
+      "step": 1621
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.33281137684175827,
+      "learning_rate": 5.8009062671887726e-05,
+      "loss": 0.5977,
+      "step": 1622
+    },
+    {
+      "epoch": 0.6492,
+      "grad_norm": 0.34054041000294205,
+      "learning_rate": 5.789152256855916e-05,
+      "loss": 0.6005,
+      "step": 1623
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.35050290727508115,
+      "learning_rate": 5.7774053136962935e-05,
+      "loss": 0.6152,
+      "step": 1624
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.35763787377139644,
+      "learning_rate": 5.765665457425102e-05,
+      "loss": 0.614,
+      "step": 1625
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.3573031278750954,
+      "learning_rate": 5.753932707745635e-05,
+      "loss": 0.607,
+      "step": 1626
+    },
+    {
+      "epoch": 0.6508,
+      "grad_norm": 0.36203299996635585,
+      "learning_rate": 5.7422070843492734e-05,
+      "loss": 0.6577,
+      "step": 1627
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3533041362444965,
+      "learning_rate": 5.730488606915429e-05,
+      "loss": 0.6193,
+      "step": 1628
+    },
+    {
+      "epoch": 0.6516,
+      "grad_norm": 0.39042971332239085,
+      "learning_rate": 5.7187772951115236e-05,
+      "loss": 0.5948,
+      "step": 1629
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.3615896964890657,
+      "learning_rate": 5.707073168592942e-05,
+      "loss": 0.6288,
+      "step": 1630
+    },
+    {
+      "epoch": 0.6524,
+      "grad_norm": 0.3872827555876587,
+      "learning_rate": 5.695376247003025e-05,
+      "loss": 0.6421,
+      "step": 1631
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3650559383497089,
+      "learning_rate": 5.6836865499730176e-05,
+      "loss": 0.61,
+      "step": 1632
+    },
+    {
+      "epoch": 0.6532,
+      "grad_norm": 0.3293487025294577,
+      "learning_rate": 5.6720040971220326e-05,
+      "loss": 0.5818,
+      "step": 1633
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.372959382045621,
+      "learning_rate": 5.660328908057028e-05,
+      "loss": 0.5676,
+      "step": 1634
+    },
+    {
+      "epoch": 0.654,
+      "grad_norm": 0.3387007113246456,
+      "learning_rate": 5.648661002372768e-05,
+      "loss": 0.5966,
+      "step": 1635
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.3264396720269018,
+      "learning_rate": 5.637000399651804e-05,
+      "loss": 0.5993,
+      "step": 1636
+    },
+    {
+      "epoch": 0.6548,
+      "grad_norm": 0.31000237024948457,
+      "learning_rate": 5.6253471194644214e-05,
+      "loss": 0.5697,
+      "step": 1637
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.3526078597111627,
+      "learning_rate": 5.613701181368618e-05,
+      "loss": 0.6248,
+      "step": 1638
+    },
+    {
+      "epoch": 0.6556,
+      "grad_norm": 0.32001206541055655,
+      "learning_rate": 5.602062604910063e-05,
+      "loss": 0.6204,
+      "step": 1639
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3496514013896952,
+      "learning_rate": 5.590431409622081e-05,
+      "loss": 0.6022,
+      "step": 1640
+    },
+    {
+      "epoch": 0.6564,
+      "grad_norm": 0.34586292300320254,
+      "learning_rate": 5.578807615025607e-05,
+      "loss": 0.598,
+      "step": 1641
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3432127154514644,
+      "learning_rate": 5.567191240629151e-05,
+      "loss": 0.5738,
+      "step": 1642
+    },
+    {
+      "epoch": 0.6572,
+      "grad_norm": 0.3364872402176342,
+      "learning_rate": 5.555582305928766e-05,
+      "loss": 0.5496,
+      "step": 1643
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.35323619348013563,
+      "learning_rate": 5.543980830408022e-05,
+      "loss": 0.6186,
+      "step": 1644
+    },
+    {
+      "epoch": 0.658,
+      "grad_norm": 0.35721395185667604,
+      "learning_rate": 5.532386833537977e-05,
+      "loss": 0.6382,
+      "step": 1645
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.33627601356894526,
+      "learning_rate": 5.520800334777132e-05,
+      "loss": 0.594,
+      "step": 1646
+    },
+    {
+      "epoch": 0.6588,
+      "grad_norm": 0.34691227027823734,
+      "learning_rate": 5.5092213535714034e-05,
+      "loss": 0.6038,
+      "step": 1647
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.32413368663446984,
+      "learning_rate": 5.497649909354083e-05,
+      "loss": 0.5532,
+      "step": 1648
+    },
+    {
+      "epoch": 0.6596,
+      "grad_norm": 0.3294145113172244,
+      "learning_rate": 5.4860860215458286e-05,
+      "loss": 0.5638,
+      "step": 1649
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.3496300712042131,
+      "learning_rate": 5.474529709554612e-05,
+      "loss": 0.5839,
+      "step": 1650
+    },
+    {
+      "epoch": 0.6604,
+      "grad_norm": 0.33031214213845245,
+      "learning_rate": 5.4629809927756794e-05,
+      "loss": 0.5767,
+      "step": 1651
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.37293263941655797,
+      "learning_rate": 5.451439890591539e-05,
+      "loss": 0.6267,
+      "step": 1652
+    },
+    {
+      "epoch": 0.6612,
+      "grad_norm": 0.33245490315332754,
+      "learning_rate": 5.439906422371914e-05,
+      "loss": 0.5575,
+      "step": 1653
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.32580261091835244,
+      "learning_rate": 5.42838060747372e-05,
+      "loss": 0.596,
+      "step": 1654
+    },
+    {
+      "epoch": 0.662,
+      "grad_norm": 0.35853580611736774,
+      "learning_rate": 5.416862465241033e-05,
+      "loss": 0.6202,
+      "step": 1655
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3498791525095215,
+      "learning_rate": 5.4053520150050384e-05,
+      "loss": 0.563,
+      "step": 1656
+    },
+    {
+      "epoch": 0.6628,
+      "grad_norm": 0.39668693688774814,
+      "learning_rate": 5.393849276084018e-05,
+      "loss": 0.63,
+      "step": 1657
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.3454852702854712,
+      "learning_rate": 5.382354267783316e-05,
+      "loss": 0.6302,
+      "step": 1658
+    },
+    {
+      "epoch": 0.6636,
+      "grad_norm": 0.3763804250470061,
+      "learning_rate": 5.370867009395294e-05,
+      "loss": 0.6038,
+      "step": 1659
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.36015198509194524,
+      "learning_rate": 5.3593875201993174e-05,
+      "loss": 0.5984,
+      "step": 1660
+    },
+    {
+      "epoch": 0.6644,
+      "grad_norm": 0.3507800181470306,
+      "learning_rate": 5.347915819461699e-05,
+      "loss": 0.5994,
+      "step": 1661
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.34420271337836417,
+      "learning_rate": 5.336451926435688e-05,
+      "loss": 0.5801,
+      "step": 1662
+    },
+    {
+      "epoch": 0.6652,
+      "grad_norm": 0.33850927670740566,
+      "learning_rate": 5.3249958603614305e-05,
+      "loss": 0.5739,
+      "step": 1663
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.351094793513127,
+      "learning_rate": 5.3135476404659366e-05,
+      "loss": 0.5978,
+      "step": 1664
+    },
+    {
+      "epoch": 0.666,
+      "grad_norm": 0.335916258589442,
+      "learning_rate": 5.302107285963045e-05,
+      "loss": 0.5728,
+      "step": 1665
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.30513854130361645,
+      "learning_rate": 5.290674816053389e-05,
+      "loss": 0.5611,
+      "step": 1666
+    },
+    {
+      "epoch": 0.6668,
+      "grad_norm": 0.35943567473275423,
+      "learning_rate": 5.279250249924383e-05,
+      "loss": 0.6203,
+      "step": 1667
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3428927765200386,
+      "learning_rate": 5.26783360675016e-05,
+      "loss": 0.6008,
+      "step": 1668
+    },
+    {
+      "epoch": 0.6676,
+      "grad_norm": 0.35017777266739336,
+      "learning_rate": 5.25642490569157e-05,
+      "loss": 0.6162,
+      "step": 1669
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.3885116261542779,
+      "learning_rate": 5.245024165896126e-05,
+      "loss": 0.6497,
+      "step": 1670
+    },
+    {
+      "epoch": 0.6684,
+      "grad_norm": 0.3400541914465486,
+      "learning_rate": 5.233631406497976e-05,
+      "loss": 0.6133,
+      "step": 1671
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3275659916402285,
+      "learning_rate": 5.222246646617886e-05,
+      "loss": 0.5754,
+      "step": 1672
+    },
+    {
+      "epoch": 0.6692,
+      "grad_norm": 0.3630102866461434,
+      "learning_rate": 5.2108699053631784e-05,
+      "loss": 0.5595,
+      "step": 1673
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.36478597471794466,
+      "learning_rate": 5.199501201827741e-05,
+      "loss": 0.5864,
+      "step": 1674
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.36960611381067765,
+      "learning_rate": 5.1881405550919493e-05,
+      "loss": 0.6221,
+      "step": 1675
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3450897290147815,
+      "learning_rate": 5.176787984222674e-05,
+      "loss": 0.6098,
+      "step": 1676
+    },
+    {
+      "epoch": 0.6708,
+      "grad_norm": 0.3821327690534526,
+      "learning_rate": 5.1654435082732175e-05,
+      "loss": 0.6306,
+      "step": 1677
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.3413146874905284,
+      "learning_rate": 5.1541071462833115e-05,
+      "loss": 0.6033,
+      "step": 1678
+    },
+    {
+      "epoch": 0.6716,
+      "grad_norm": 0.3553980761339589,
+      "learning_rate": 5.1427789172790566e-05,
+      "loss": 0.6723,
+      "step": 1679
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.34657476100242796,
+      "learning_rate": 5.1314588402729044e-05,
+      "loss": 0.6361,
+      "step": 1680
+    },
+    {
+      "epoch": 0.6724,
+      "grad_norm": 0.3495602397264736,
+      "learning_rate": 5.120146934263638e-05,
+      "loss": 0.6052,
+      "step": 1681
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.3308800927928125,
+      "learning_rate": 5.10884321823631e-05,
+      "loss": 0.5832,
+      "step": 1682
+    },
+    {
+      "epoch": 0.6732,
+      "grad_norm": 0.34180578026316344,
+      "learning_rate": 5.0975477111622426e-05,
+      "loss": 0.6362,
+      "step": 1683
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.316429545263734,
+      "learning_rate": 5.086260431998967e-05,
+      "loss": 0.5789,
+      "step": 1684
+    },
+    {
+      "epoch": 0.674,
+      "grad_norm": 0.3473239200277124,
+      "learning_rate": 5.074981399690218e-05,
+      "loss": 0.6228,
+      "step": 1685
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.3212290043833729,
+      "learning_rate": 5.063710633165881e-05,
+      "loss": 0.5681,
+      "step": 1686
+    },
+    {
+      "epoch": 0.6748,
+      "grad_norm": 0.3376077723916017,
+      "learning_rate": 5.052448151341967e-05,
+      "loss": 0.6022,
+      "step": 1687
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.34382234786732874,
+      "learning_rate": 5.0411939731205946e-05,
+      "loss": 0.5896,
+      "step": 1688
+    },
+    {
+      "epoch": 0.6756,
+      "grad_norm": 0.334168296603689,
+      "learning_rate": 5.0299481173899296e-05,
+      "loss": 0.6305,
+      "step": 1689
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.32108508408255027,
+      "learning_rate": 5.018710603024187e-05,
+      "loss": 0.568,
+      "step": 1690
+    },
+    {
+      "epoch": 0.6764,
+      "grad_norm": 0.337426417658709,
+      "learning_rate": 5.0074814488835665e-05,
+      "loss": 0.6087,
+      "step": 1691
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3325197469675423,
+      "learning_rate": 4.99626067381425e-05,
+      "loss": 0.5546,
+      "step": 1692
+    },
+    {
+      "epoch": 0.6772,
+      "grad_norm": 0.3344666617338685,
+      "learning_rate": 4.9850482966483455e-05,
+      "loss": 0.5597,
+      "step": 1693
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.35372497641394407,
+      "learning_rate": 4.973844336203879e-05,
+      "loss": 0.6546,
+      "step": 1694
+    },
+    {
+      "epoch": 0.678,
+      "grad_norm": 0.3461997501455237,
+      "learning_rate": 4.962648811284738e-05,
+      "loss": 0.6123,
+      "step": 1695
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.35165958624702814,
+      "learning_rate": 4.951461740680655e-05,
+      "loss": 0.6477,
+      "step": 1696
+    },
+    {
+      "epoch": 0.6788,
+      "grad_norm": 0.3395316114162352,
+      "learning_rate": 4.9402831431671834e-05,
+      "loss": 0.5678,
+      "step": 1697
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.3291753734724247,
+      "learning_rate": 4.929113037505641e-05,
+      "loss": 0.5985,
+      "step": 1698
+    },
+    {
+      "epoch": 0.6796,
+      "grad_norm": 0.36381547209597087,
+      "learning_rate": 4.91795144244311e-05,
+      "loss": 0.6167,
+      "step": 1699
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.32974021690173844,
+      "learning_rate": 4.9067983767123736e-05,
+      "loss": 0.5671,
+      "step": 1700
+    },
+    {
+      "epoch": 0.6804,
+      "grad_norm": 0.3320824590238343,
+      "learning_rate": 4.8956538590319055e-05,
+      "loss": 0.5249,
+      "step": 1701
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.3419505556167843,
+      "learning_rate": 4.884517908105837e-05,
+      "loss": 0.5919,
+      "step": 1702
+    },
+    {
+      "epoch": 0.6812,
+      "grad_norm": 0.3397681816840063,
+      "learning_rate": 4.873390542623922e-05,
+      "loss": 0.5519,
+      "step": 1703
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3875580450426956,
+      "learning_rate": 4.8622717812615e-05,
+      "loss": 0.653,
+      "step": 1704
+    },
+    {
+      "epoch": 0.682,
+      "grad_norm": 0.37924891875909367,
+      "learning_rate": 4.851161642679466e-05,
+      "loss": 0.5606,
+      "step": 1705
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.368679004842044,
+      "learning_rate": 4.840060145524254e-05,
+      "loss": 0.676,
+      "step": 1706
+    },
+    {
+      "epoch": 0.6828,
+      "grad_norm": 0.3276517811322601,
+      "learning_rate": 4.8289673084277954e-05,
+      "loss": 0.5866,
+      "step": 1707
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.35101894023359137,
+      "learning_rate": 4.817883150007474e-05,
+      "loss": 0.6115,
+      "step": 1708
+    },
+    {
+      "epoch": 0.6836,
+      "grad_norm": 0.34967835358066,
+      "learning_rate": 4.80680768886612e-05,
+      "loss": 0.6214,
+      "step": 1709
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.337382406007058,
+      "learning_rate": 4.795740943591955e-05,
+      "loss": 0.6163,
+      "step": 1710
+    },
+    {
+      "epoch": 0.6844,
+      "grad_norm": 0.3232351705102931,
+      "learning_rate": 4.7846829327585876e-05,
+      "loss": 0.5935,
+      "step": 1711
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.32085864274891623,
+      "learning_rate": 4.77363367492496e-05,
+      "loss": 0.5997,
+      "step": 1712
+    },
+    {
+      "epoch": 0.6852,
+      "grad_norm": 0.33468587225981816,
+      "learning_rate": 4.762593188635321e-05,
+      "loss": 0.6233,
+      "step": 1713
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.39442272618975327,
+      "learning_rate": 4.751561492419202e-05,
+      "loss": 0.6117,
+      "step": 1714
+    },
+    {
+      "epoch": 0.686,
+      "grad_norm": 0.3461264133463066,
+      "learning_rate": 4.74053860479137e-05,
+      "loss": 0.5971,
+      "step": 1715
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.34582394392278937,
+      "learning_rate": 4.729524544251837e-05,
+      "loss": 0.6283,
+      "step": 1716
+    },
+    {
+      "epoch": 0.6868,
+      "grad_norm": 0.3709121160353713,
+      "learning_rate": 4.718519329285771e-05,
+      "loss": 0.612,
+      "step": 1717
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.3450289858246148,
+      "learning_rate": 4.707522978363508e-05,
+      "loss": 0.6151,
+      "step": 1718
+    },
+    {
+      "epoch": 0.6876,
+      "grad_norm": 0.3554537104560121,
+      "learning_rate": 4.696535509940498e-05,
+      "loss": 0.5926,
+      "step": 1719
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3700955022172051,
+      "learning_rate": 4.6855569424572955e-05,
+      "loss": 0.5758,
+      "step": 1720
+    },
+    {
+      "epoch": 0.6884,
+      "grad_norm": 0.3521617004846447,
+      "learning_rate": 4.674587294339513e-05,
+      "loss": 0.6001,
+      "step": 1721
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.37920595064499485,
+      "learning_rate": 4.663626583997789e-05,
+      "loss": 0.6172,
+      "step": 1722
+    },
+    {
+      "epoch": 0.6892,
+      "grad_norm": 0.3311674718568459,
+      "learning_rate": 4.652674829827761e-05,
+      "loss": 0.5296,
+      "step": 1723
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.34088698548975355,
+      "learning_rate": 4.6417320502100316e-05,
+      "loss": 0.6063,
+      "step": 1724
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.33469346316607584,
+      "learning_rate": 4.630798263510162e-05,
+      "loss": 0.5783,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.34441486420204004,
+      "learning_rate": 4.6198734880785965e-05,
+      "loss": 0.5876,
+      "step": 1726
+    },
+    {
+      "epoch": 0.6908,
+      "grad_norm": 0.3521399585532281,
+      "learning_rate": 4.608957742250667e-05,
+      "loss": 0.5987,
+      "step": 1727
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3326145599750792,
+      "learning_rate": 4.598051044346542e-05,
+      "loss": 0.5592,
+      "step": 1728
+    },
+    {
+      "epoch": 0.6916,
+      "grad_norm": 0.34024540042317375,
+      "learning_rate": 4.587153412671217e-05,
+      "loss": 0.5718,
+      "step": 1729
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.33676043102294895,
+      "learning_rate": 4.5762648655144666e-05,
+      "loss": 0.6207,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6924,
+      "grad_norm": 0.3585715026248596,
+      "learning_rate": 4.565385421150816e-05,
+      "loss": 0.5834,
+      "step": 1731
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.33116888316468274,
+      "learning_rate": 4.55451509783951e-05,
+      "loss": 0.5826,
+      "step": 1732
+    },
+    {
+      "epoch": 0.6932,
+      "grad_norm": 0.3244130276411306,
+      "learning_rate": 4.543653913824496e-05,
+      "loss": 0.5582,
+      "step": 1733
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.35764178230441473,
+      "learning_rate": 4.53280188733437e-05,
+      "loss": 0.6015,
+      "step": 1734
+    },
+    {
+      "epoch": 0.694,
+      "grad_norm": 0.34451089875575613,
+      "learning_rate": 4.5219590365823714e-05,
+      "loss": 0.5741,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.33862571879533104,
+      "learning_rate": 4.511125379766331e-05,
+      "loss": 0.6311,
+      "step": 1736
+    },
+    {
+      "epoch": 0.6948,
+      "grad_norm": 0.35252409439730675,
+      "learning_rate": 4.5003009350686474e-05,
+      "loss": 0.6392,
+      "step": 1737
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.3598745433163055,
+      "learning_rate": 4.489485720656266e-05,
+      "loss": 0.6006,
+      "step": 1738
+    },
+    {
+      "epoch": 0.6956,
+      "grad_norm": 0.3374253785220396,
+      "learning_rate": 4.478679754680639e-05,
+      "loss": 0.5838,
+      "step": 1739
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3342818500271891,
+      "learning_rate": 4.467883055277695e-05,
+      "loss": 0.6056,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6964,
+      "grad_norm": 0.37278807358701976,
+      "learning_rate": 4.457095640567803e-05,
+      "loss": 0.6245,
+      "step": 1741
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.3601495067664831,
+      "learning_rate": 4.446317528655766e-05,
+      "loss": 0.6093,
+      "step": 1742
+    },
+    {
+      "epoch": 0.6972,
+      "grad_norm": 0.34751045601642516,
+      "learning_rate": 4.435548737630756e-05,
+      "loss": 0.607,
+      "step": 1743
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3435708716862839,
+      "learning_rate": 4.424789285566316e-05,
+      "loss": 0.5987,
+      "step": 1744
+    },
+    {
+      "epoch": 0.698,
+      "grad_norm": 0.33328993609492175,
+      "learning_rate": 4.414039190520308e-05,
+      "loss": 0.6102,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.32839634033697485,
+      "learning_rate": 4.4032984705348845e-05,
+      "loss": 0.5584,
+      "step": 1746
+    },
+    {
+      "epoch": 0.6988,
+      "grad_norm": 0.3260876866685833,
+      "learning_rate": 4.3925671436364804e-05,
+      "loss": 0.6048,
+      "step": 1747
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.33469728950785477,
+      "learning_rate": 4.3818452278357445e-05,
+      "loss": 0.5648,
+      "step": 1748
+    },
+    {
+      "epoch": 0.6996,
+      "grad_norm": 0.3412017526101026,
+      "learning_rate": 4.371132741127553e-05,
+      "loss": 0.5905,
+      "step": 1749
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.3983993751410565,
+      "learning_rate": 4.360429701490934e-05,
+      "loss": 0.5768,
+      "step": 1750
+    },
+    {
+      "epoch": 0.7004,
+      "grad_norm": 0.3611546245259949,
+      "learning_rate": 4.3497361268890834e-05,
+      "loss": 0.641,
+      "step": 1751
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.34650976591793875,
+      "learning_rate": 4.339052035269291e-05,
+      "loss": 0.6096,
+      "step": 1752
+    },
+    {
+      "epoch": 0.7012,
+      "grad_norm": 0.34040200369753926,
+      "learning_rate": 4.328377444562948e-05,
+      "loss": 0.5889,
+      "step": 1753
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.3665520290469235,
+      "learning_rate": 4.3177123726854896e-05,
+      "loss": 0.6222,
+      "step": 1754
+    },
+    {
+      "epoch": 0.702,
+      "grad_norm": 0.34719181249410996,
+      "learning_rate": 4.307056837536373e-05,
+      "loss": 0.5843,
+      "step": 1755
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.3409310044908438,
+      "learning_rate": 4.296410856999062e-05,
+      "loss": 0.5861,
+      "step": 1756
+    },
+    {
+      "epoch": 0.7028,
+      "grad_norm": 0.3281722642151921,
+      "learning_rate": 4.285774448940972e-05,
+      "loss": 0.567,
+      "step": 1757
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.34192081092616117,
+      "learning_rate": 4.275147631213465e-05,
+      "loss": 0.6002,
+      "step": 1758
+    },
+    {
+      "epoch": 0.7036,
+      "grad_norm": 0.3337898835715986,
+      "learning_rate": 4.2645304216517926e-05,
+      "loss": 0.5795,
+      "step": 1759
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.358841520504681,
+      "learning_rate": 4.253922838075095e-05,
+      "loss": 0.6217,
+      "step": 1760
+    },
+    {
+      "epoch": 0.7044,
+      "grad_norm": 0.33134214333215883,
+      "learning_rate": 4.243324898286348e-05,
+      "loss": 0.5686,
+      "step": 1761
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.35510024493667014,
+      "learning_rate": 4.232736620072341e-05,
+      "loss": 0.5418,
+      "step": 1762
+    },
+    {
+      "epoch": 0.7052,
+      "grad_norm": 0.3230400921160106,
+      "learning_rate": 4.222158021203657e-05,
+      "loss": 0.6214,
+      "step": 1763
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.33404401976266973,
+      "learning_rate": 4.2115891194346224e-05,
+      "loss": 0.6302,
+      "step": 1764
+    },
+    {
+      "epoch": 0.706,
+      "grad_norm": 0.3624898248699408,
+      "learning_rate": 4.2010299325033034e-05,
+      "loss": 0.6381,
+      "step": 1765
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.33125456052914526,
+      "learning_rate": 4.1904804781314436e-05,
+      "loss": 0.6065,
+      "step": 1766
+    },
+    {
+      "epoch": 0.7068,
+      "grad_norm": 0.3291357243560303,
+      "learning_rate": 4.179940774024469e-05,
+      "loss": 0.5854,
+      "step": 1767
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.32261390402100604,
+      "learning_rate": 4.169410837871427e-05,
+      "loss": 0.5558,
+      "step": 1768
+    },
+    {
+      "epoch": 0.7076,
+      "grad_norm": 0.3321667286365183,
+      "learning_rate": 4.158890687344986e-05,
+      "loss": 0.5444,
+      "step": 1769
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.33531451395956424,
+      "learning_rate": 4.1483803401013796e-05,
+      "loss": 0.5699,
+      "step": 1770
+    },
+    {
+      "epoch": 0.7084,
+      "grad_norm": 0.35701748113397946,
+      "learning_rate": 4.137879813780388e-05,
+      "loss": 0.5794,
+      "step": 1771
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.33230272416291334,
+      "learning_rate": 4.127389126005319e-05,
+      "loss": 0.5888,
+      "step": 1772
+    },
+    {
+      "epoch": 0.7092,
+      "grad_norm": 0.3468405819430816,
+      "learning_rate": 4.116908294382955e-05,
+      "loss": 0.604,
+      "step": 1773
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.33183066782789583,
+      "learning_rate": 4.10643733650355e-05,
+      "loss": 0.6377,
+      "step": 1774
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.34181576143335957,
+      "learning_rate": 4.0959762699407766e-05,
+      "loss": 0.5729,
+      "step": 1775
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.36846919900828995,
+      "learning_rate": 4.0855251122517056e-05,
+      "loss": 0.6643,
+      "step": 1776
+    },
+    {
+      "epoch": 0.7108,
+      "grad_norm": 0.3434043943463887,
+      "learning_rate": 4.0750838809767875e-05,
+      "loss": 0.631,
+      "step": 1777
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.33783837112536425,
+      "learning_rate": 4.064652593639808e-05,
+      "loss": 0.5896,
+      "step": 1778
+    },
+    {
+      "epoch": 0.7116,
+      "grad_norm": 0.35646109372554274,
+      "learning_rate": 4.0542312677478614e-05,
+      "loss": 0.6301,
+      "step": 1779
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3322128925737181,
+      "learning_rate": 4.043819920791322e-05,
+      "loss": 0.5699,
+      "step": 1780
+    },
+    {
+      "epoch": 0.7124,
+      "grad_norm": 0.3401371606977615,
+      "learning_rate": 4.0334185702438185e-05,
+      "loss": 0.5688,
+      "step": 1781
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.3777176438153494,
+      "learning_rate": 4.0230272335622064e-05,
+      "loss": 0.6604,
+      "step": 1782
+    },
+    {
+      "epoch": 0.7132,
+      "grad_norm": 0.32841489423285625,
+      "learning_rate": 4.012645928186533e-05,
+      "loss": 0.5964,
+      "step": 1783
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.34963614464945214,
+      "learning_rate": 4.002274671540006e-05,
+      "loss": 0.627,
+      "step": 1784
+    },
+    {
+      "epoch": 0.714,
+      "grad_norm": 0.3869157745953258,
+      "learning_rate": 3.991913481028965e-05,
+      "loss": 0.6071,
+      "step": 1785
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.37521248581587024,
+      "learning_rate": 3.981562374042866e-05,
+      "loss": 0.6436,
+      "step": 1786
+    },
+    {
+      "epoch": 0.7148,
+      "grad_norm": 0.32794514427064697,
+      "learning_rate": 3.9712213679542385e-05,
+      "loss": 0.5686,
+      "step": 1787
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3344873846850927,
+      "learning_rate": 3.960890480118653e-05,
+      "loss": 0.5606,
+      "step": 1788
+    },
+    {
+      "epoch": 0.7156,
+      "grad_norm": 0.3545953735117356,
+      "learning_rate": 3.950569727874703e-05,
+      "loss": 0.6344,
+      "step": 1789
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.3552982691824513,
+      "learning_rate": 3.940259128543967e-05,
+      "loss": 0.5792,
+      "step": 1790
+    },
+    {
+      "epoch": 0.7164,
+      "grad_norm": 0.3398457451931291,
+      "learning_rate": 3.92995869943099e-05,
+      "loss": 0.6545,
+      "step": 1791
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3452332071130584,
+      "learning_rate": 3.9196684578232476e-05,
+      "loss": 0.6335,
+      "step": 1792
+    },
+    {
+      "epoch": 0.7172,
+      "grad_norm": 0.32510328040183545,
+      "learning_rate": 3.9093884209911134e-05,
+      "loss": 0.585,
+      "step": 1793
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.34076165773645006,
+      "learning_rate": 3.8991186061878314e-05,
+      "loss": 0.5706,
+      "step": 1794
+    },
+    {
+      "epoch": 0.718,
+      "grad_norm": 0.3421915886836552,
+      "learning_rate": 3.8888590306494974e-05,
+      "loss": 0.5821,
+      "step": 1795
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.36172255238718126,
+      "learning_rate": 3.8786097115950214e-05,
+      "loss": 0.6032,
+      "step": 1796
+    },
+    {
+      "epoch": 0.7188,
+      "grad_norm": 0.3353589426405682,
+      "learning_rate": 3.868370666226094e-05,
+      "loss": 0.5814,
+      "step": 1797
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.32584203607046014,
+      "learning_rate": 3.858141911727168e-05,
+      "loss": 0.5738,
+      "step": 1798
+    },
+    {
+      "epoch": 0.7196,
+      "grad_norm": 0.34155613523147377,
+      "learning_rate": 3.8479234652654175e-05,
+      "loss": 0.5847,
+      "step": 1799
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.36998435870649954,
+      "learning_rate": 3.8377153439907266e-05,
+      "loss": 0.5652,
+      "step": 1800
+    },
+    {
+      "epoch": 0.7204,
+      "grad_norm": 0.34777455503922344,
+      "learning_rate": 3.8275175650356485e-05,
+      "loss": 0.6085,
+      "step": 1801
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.33243987266833946,
+      "learning_rate": 3.817330145515374e-05,
+      "loss": 0.5815,
+      "step": 1802
+    },
+    {
+      "epoch": 0.7212,
+      "grad_norm": 0.3911085821400549,
+      "learning_rate": 3.807153102527704e-05,
+      "loss": 0.5978,
+      "step": 1803
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3707021613874098,
+      "learning_rate": 3.7969864531530344e-05,
+      "loss": 0.5883,
+      "step": 1804
+    },
+    {
+      "epoch": 0.722,
+      "grad_norm": 0.3798153645094958,
+      "learning_rate": 3.786830214454315e-05,
+      "loss": 0.6166,
+      "step": 1805
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.34308854005452744,
+      "learning_rate": 3.776684403477015e-05,
+      "loss": 0.5656,
+      "step": 1806
+    },
+    {
+      "epoch": 0.7228,
+      "grad_norm": 0.36171966315223114,
+      "learning_rate": 3.766549037249112e-05,
+      "loss": 0.6234,
+      "step": 1807
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.34254198497350036,
+      "learning_rate": 3.756424132781043e-05,
+      "loss": 0.561,
+      "step": 1808
+    },
+    {
+      "epoch": 0.7236,
+      "grad_norm": 0.3493856315987089,
+      "learning_rate": 3.7463097070657e-05,
+      "loss": 0.589,
+      "step": 1809
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.3294289895291501,
+      "learning_rate": 3.736205777078381e-05,
+      "loss": 0.5617,
+      "step": 1810
+    },
+    {
+      "epoch": 0.7244,
+      "grad_norm": 0.38191187657568854,
+      "learning_rate": 3.72611235977677e-05,
+      "loss": 0.5502,
+      "step": 1811
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.7463582450918105,
+      "learning_rate": 3.716029472100903e-05,
+      "loss": 0.6148,
+      "step": 1812
+    },
+    {
+      "epoch": 0.7252,
+      "grad_norm": 0.3627308277690777,
+      "learning_rate": 3.705957130973149e-05,
+      "loss": 0.6106,
+      "step": 1813
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.3509859534297781,
+      "learning_rate": 3.69589535329818e-05,
+      "loss": 0.5607,
+      "step": 1814
+    },
+    {
+      "epoch": 0.726,
+      "grad_norm": 0.3570227999609146,
+      "learning_rate": 3.6858441559629306e-05,
+      "loss": 0.5917,
+      "step": 1815
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3211693703387772,
+      "learning_rate": 3.6758035558365825e-05,
+      "loss": 0.6232,
+      "step": 1816
+    },
+    {
+      "epoch": 0.7268,
+      "grad_norm": 0.3297213799425913,
+      "learning_rate": 3.665773569770526e-05,
+      "loss": 0.559,
+      "step": 1817
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.34321841832562183,
+      "learning_rate": 3.655754214598349e-05,
+      "loss": 0.5917,
+      "step": 1818
+    },
+    {
+      "epoch": 0.7276,
+      "grad_norm": 0.3439005678265292,
+      "learning_rate": 3.6457455071357916e-05,
+      "loss": 0.6591,
+      "step": 1819
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.32660589437964493,
+      "learning_rate": 3.63574746418072e-05,
+      "loss": 0.5578,
+      "step": 1820
+    },
+    {
+      "epoch": 0.7284,
+      "grad_norm": 0.3443583477589589,
+      "learning_rate": 3.6257601025131026e-05,
+      "loss": 0.6174,
+      "step": 1821
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.3866299288827764,
+      "learning_rate": 3.615783438894991e-05,
+      "loss": 0.6492,
+      "step": 1822
+    },
+    {
+      "epoch": 0.7292,
+      "grad_norm": 0.392867925507431,
+      "learning_rate": 3.605817490070464e-05,
+      "loss": 0.6031,
+      "step": 1823
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.34054806404404225,
+      "learning_rate": 3.595862272765638e-05,
+      "loss": 0.5955,
+      "step": 1824
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.36146903190102,
+      "learning_rate": 3.585917803688603e-05,
+      "loss": 0.5402,
+      "step": 1825
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.3420451815961708,
+      "learning_rate": 3.575984099529414e-05,
+      "loss": 0.6383,
+      "step": 1826
+    },
+    {
+      "epoch": 0.7308,
+      "grad_norm": 0.3163032617663247,
+      "learning_rate": 3.56606117696006e-05,
+      "loss": 0.5998,
+      "step": 1827
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3507263830427564,
+      "learning_rate": 3.556149052634443e-05,
+      "loss": 0.5836,
+      "step": 1828
+    },
+    {
+      "epoch": 0.7316,
+      "grad_norm": 0.34326890644825564,
+      "learning_rate": 3.546247743188328e-05,
+      "loss": 0.5431,
+      "step": 1829
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.33205724492765964,
+      "learning_rate": 3.5363572652393326e-05,
+      "loss": 0.6209,
+      "step": 1830
+    },
+    {
+      "epoch": 0.7324,
+      "grad_norm": 0.3166654437001939,
+      "learning_rate": 3.526477635386904e-05,
+      "loss": 0.559,
+      "step": 1831
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3514293830851197,
+      "learning_rate": 3.5166088702122736e-05,
+      "loss": 0.6157,
+      "step": 1832
+    },
+    {
+      "epoch": 0.7332,
+      "grad_norm": 0.34952287180088665,
+      "learning_rate": 3.5067509862784454e-05,
+      "loss": 0.5588,
+      "step": 1833
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.37518819397486813,
+      "learning_rate": 3.496904000130151e-05,
+      "loss": 0.5552,
+      "step": 1834
+    },
+    {
+      "epoch": 0.734,
+      "grad_norm": 0.3316850789388843,
+      "learning_rate": 3.487067928293848e-05,
+      "loss": 0.6146,
+      "step": 1835
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3422435199619646,
+      "learning_rate": 3.47724278727766e-05,
+      "loss": 0.6564,
+      "step": 1836
+    },
+    {
+      "epoch": 0.7348,
+      "grad_norm": 0.33318729202355385,
+      "learning_rate": 3.467428593571371e-05,
+      "loss": 0.5803,
+      "step": 1837
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.3548680033208881,
+      "learning_rate": 3.457625363646399e-05,
+      "loss": 0.6029,
+      "step": 1838
+    },
+    {
+      "epoch": 0.7356,
+      "grad_norm": 0.3668029311270937,
+      "learning_rate": 3.447833113955748e-05,
+      "loss": 0.6209,
+      "step": 1839
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.37981508010159476,
+      "learning_rate": 3.4380518609340076e-05,
+      "loss": 0.6044,
+      "step": 1840
+    },
+    {
+      "epoch": 0.7364,
+      "grad_norm": 0.31997437312829297,
+      "learning_rate": 3.4282816209972956e-05,
+      "loss": 0.6278,
+      "step": 1841
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.3719363008912083,
+      "learning_rate": 3.4185224105432656e-05,
+      "loss": 0.6127,
+      "step": 1842
+    },
+    {
+      "epoch": 0.7372,
+      "grad_norm": 0.35338133590475707,
+      "learning_rate": 3.40877424595104e-05,
+      "loss": 0.5807,
+      "step": 1843
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.34220788950704023,
+      "learning_rate": 3.3990371435812187e-05,
+      "loss": 0.5847,
+      "step": 1844
+    },
+    {
+      "epoch": 0.738,
+      "grad_norm": 0.3466991009204236,
+      "learning_rate": 3.389311119775828e-05,
+      "loss": 0.5966,
+      "step": 1845
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.38253275922104424,
+      "learning_rate": 3.379596190858296e-05,
+      "loss": 0.6273,
+      "step": 1846
+    },
+    {
+      "epoch": 0.7388,
+      "grad_norm": 0.35428621430519175,
+      "learning_rate": 3.3698923731334453e-05,
+      "loss": 0.5979,
+      "step": 1847
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3364181606731447,
+      "learning_rate": 3.3601996828874326e-05,
+      "loss": 0.6358,
+      "step": 1848
+    },
+    {
+      "epoch": 0.7396,
+      "grad_norm": 0.3383820336858019,
+      "learning_rate": 3.3505181363877535e-05,
+      "loss": 0.5772,
+      "step": 1849
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.3567661646983929,
+      "learning_rate": 3.340847749883191e-05,
+      "loss": 0.6294,
+      "step": 1850
+    },
+    {
+      "epoch": 0.7404,
+      "grad_norm": 0.35059920283725304,
+      "learning_rate": 3.3311885396038e-05,
+      "loss": 0.5829,
+      "step": 1851
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3359809129607562,
+      "learning_rate": 3.321540521760883e-05,
+      "loss": 0.619,
+      "step": 1852
+    },
+    {
+      "epoch": 0.7412,
+      "grad_norm": 0.35364262160446985,
+      "learning_rate": 3.3119037125469554e-05,
+      "loss": 0.5866,
+      "step": 1853
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.3368427844750738,
+      "learning_rate": 3.3022781281357186e-05,
+      "loss": 0.525,
+      "step": 1854
+    },
+    {
+      "epoch": 0.742,
+      "grad_norm": 0.34779304005488076,
+      "learning_rate": 3.292663784682036e-05,
+      "loss": 0.5958,
+      "step": 1855
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3610464453397608,
+      "learning_rate": 3.2830606983219033e-05,
+      "loss": 0.5725,
+      "step": 1856
+    },
+    {
+      "epoch": 0.7428,
+      "grad_norm": 0.34493718175933263,
+      "learning_rate": 3.2734688851724274e-05,
+      "loss": 0.5806,
+      "step": 1857
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.34079854407821586,
+      "learning_rate": 3.2638883613317974e-05,
+      "loss": 0.5926,
+      "step": 1858
+    },
+    {
+      "epoch": 0.7436,
+      "grad_norm": 0.3374882582749612,
+      "learning_rate": 3.2543191428792465e-05,
+      "loss": 0.6039,
+      "step": 1859
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3423983829033276,
+      "learning_rate": 3.2447612458750365e-05,
+      "loss": 0.5932,
+      "step": 1860
+    },
+    {
+      "epoch": 0.7444,
+      "grad_norm": 0.36799890179009165,
+      "learning_rate": 3.235214686360432e-05,
+      "loss": 0.6211,
+      "step": 1861
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.39461159177466326,
+      "learning_rate": 3.2256794803576704e-05,
+      "loss": 0.6198,
+      "step": 1862
+    },
+    {
+      "epoch": 0.7452,
+      "grad_norm": 0.3395489404640023,
+      "learning_rate": 3.21615564386993e-05,
+      "loss": 0.5558,
+      "step": 1863
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3431196916316968,
+      "learning_rate": 3.206643192881307e-05,
+      "loss": 0.5663,
+      "step": 1864
+    },
+    {
+      "epoch": 0.746,
+      "grad_norm": 0.34561539999980645,
+      "learning_rate": 3.197142143356787e-05,
+      "loss": 0.5618,
+      "step": 1865
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.3343052053046587,
+      "learning_rate": 3.1876525112422286e-05,
+      "loss": 0.5943,
+      "step": 1866
+    },
+    {
+      "epoch": 0.7468,
+      "grad_norm": 0.35098310618615475,
+      "learning_rate": 3.178174312464326e-05,
+      "loss": 0.6065,
+      "step": 1867
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3766296479216891,
+      "learning_rate": 3.1687075629305786e-05,
+      "loss": 0.5428,
+      "step": 1868
+    },
+    {
+      "epoch": 0.7476,
+      "grad_norm": 0.3501028408374377,
+      "learning_rate": 3.159252278529271e-05,
+      "loss": 0.5944,
+      "step": 1869
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.33062667658635936,
+      "learning_rate": 3.149808475129452e-05,
+      "loss": 0.5581,
+      "step": 1870
+    },
+    {
+      "epoch": 0.7484,
+      "grad_norm": 0.35539241907167646,
+      "learning_rate": 3.140376168580901e-05,
+      "loss": 0.6214,
+      "step": 1871
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3341801415583605,
+      "learning_rate": 3.130955374714094e-05,
+      "loss": 0.6086,
+      "step": 1872
+    },
+    {
+      "epoch": 0.7492,
+      "grad_norm": 0.3542411336205624,
+      "learning_rate": 3.121546109340191e-05,
+      "loss": 0.6504,
+      "step": 1873
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.34736361535409627,
+      "learning_rate": 3.112148388250999e-05,
+      "loss": 0.6532,
+      "step": 1874
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.3810169767837099,
+      "learning_rate": 3.102762227218957e-05,
+      "loss": 0.6157,
+      "step": 1875
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.35087605354461415,
+      "learning_rate": 3.093387641997101e-05,
+      "loss": 0.5933,
+      "step": 1876
+    },
+    {
+      "epoch": 0.7508,
+      "grad_norm": 0.3185970759422581,
+      "learning_rate": 3.084024648319034e-05,
+      "loss": 0.5531,
+      "step": 1877
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.3669794767755098,
+      "learning_rate": 3.074673261898903e-05,
+      "loss": 0.6106,
+      "step": 1878
+    },
+    {
+      "epoch": 0.7516,
+      "grad_norm": 0.3496011247614334,
+      "learning_rate": 3.0653334984313806e-05,
+      "loss": 0.6039,
+      "step": 1879
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3222656267876488,
+      "learning_rate": 3.056005373591637e-05,
+      "loss": 0.5845,
+      "step": 1880
+    },
+    {
+      "epoch": 0.7524,
+      "grad_norm": 0.35643373977727705,
+      "learning_rate": 3.0466889030352973e-05,
+      "loss": 0.5773,
+      "step": 1881
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.3541552847448694,
+      "learning_rate": 3.0373841023984306e-05,
+      "loss": 0.6217,
+      "step": 1882
+    },
+    {
+      "epoch": 0.7532,
+      "grad_norm": 0.34768359118936737,
+      "learning_rate": 3.0280909872975193e-05,
+      "loss": 0.5923,
+      "step": 1883
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.32586377284940893,
+      "learning_rate": 3.0188095733294386e-05,
+      "loss": 0.5702,
+      "step": 1884
+    },
+    {
+      "epoch": 0.754,
+      "grad_norm": 0.36000759146785977,
+      "learning_rate": 3.0095398760714267e-05,
+      "loss": 0.6107,
+      "step": 1885
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.35031617975658125,
+      "learning_rate": 3.0002819110810475e-05,
+      "loss": 0.6063,
+      "step": 1886
+    },
+    {
+      "epoch": 0.7548,
+      "grad_norm": 0.7165571029926521,
+      "learning_rate": 2.9910356938961782e-05,
+      "loss": 0.5694,
+      "step": 1887
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3615817705066542,
+      "learning_rate": 2.981801240034985e-05,
+      "loss": 0.5888,
+      "step": 1888
+    },
+    {
+      "epoch": 0.7556,
+      "grad_norm": 0.3516248634035183,
+      "learning_rate": 2.9725785649958892e-05,
+      "loss": 0.6212,
+      "step": 1889
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.3355532898864876,
+      "learning_rate": 2.9633676842575387e-05,
+      "loss": 0.5598,
+      "step": 1890
+    },
+    {
+      "epoch": 0.7564,
+      "grad_norm": 0.370978793994998,
+      "learning_rate": 2.9541686132787905e-05,
+      "loss": 0.6965,
+      "step": 1891
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.34118286549897464,
+      "learning_rate": 2.944981367498677e-05,
+      "loss": 0.5536,
+      "step": 1892
+    },
+    {
+      "epoch": 0.7572,
+      "grad_norm": 0.35082758802996605,
+      "learning_rate": 2.93580596233639e-05,
+      "loss": 0.5757,
+      "step": 1893
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.35326206507041236,
+      "learning_rate": 2.9266424131912497e-05,
+      "loss": 0.5663,
+      "step": 1894
+    },
+    {
+      "epoch": 0.758,
+      "grad_norm": 0.358877536846838,
+      "learning_rate": 2.9174907354426696e-05,
+      "loss": 0.6088,
+      "step": 1895
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.36721273543101884,
+      "learning_rate": 2.9083509444501432e-05,
+      "loss": 0.5916,
+      "step": 1896
+    },
+    {
+      "epoch": 0.7588,
+      "grad_norm": 0.34884960136342424,
+      "learning_rate": 2.899223055553221e-05,
+      "loss": 0.5862,
+      "step": 1897
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.3315108311402396,
+      "learning_rate": 2.890107084071465e-05,
+      "loss": 0.5436,
+      "step": 1898
+    },
+    {
+      "epoch": 0.7596,
+      "grad_norm": 0.35488601916656104,
+      "learning_rate": 2.8810030453044478e-05,
+      "loss": 0.6222,
+      "step": 1899
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.34667998487899204,
+      "learning_rate": 2.8719109545317103e-05,
+      "loss": 0.5571,
+      "step": 1900
+    },
+    {
+      "epoch": 0.7604,
+      "grad_norm": 0.33645514984848884,
+      "learning_rate": 2.8628308270127335e-05,
+      "loss": 0.5897,
+      "step": 1901
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.41888725813179317,
+      "learning_rate": 2.853762677986932e-05,
+      "loss": 0.61,
+      "step": 1902
+    },
+    {
+      "epoch": 0.7612,
+      "grad_norm": 0.33913218882422785,
+      "learning_rate": 2.844706522673616e-05,
+      "loss": 0.5864,
+      "step": 1903
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.36078459614959707,
+      "learning_rate": 2.835662376271957e-05,
+      "loss": 0.5586,
+      "step": 1904
+    },
+    {
+      "epoch": 0.762,
+      "grad_norm": 0.3841008489119458,
+      "learning_rate": 2.8266302539609745e-05,
+      "loss": 0.551,
+      "step": 1905
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.35889182134957104,
+      "learning_rate": 2.817610170899517e-05,
+      "loss": 0.603,
+      "step": 1906
+    },
+    {
+      "epoch": 0.7628,
+      "grad_norm": 0.3304056752254963,
+      "learning_rate": 2.8086021422262122e-05,
+      "loss": 0.59,
+      "step": 1907
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3398864641550042,
+      "learning_rate": 2.7996061830594712e-05,
+      "loss": 0.5649,
+      "step": 1908
+    },
+    {
+      "epoch": 0.7636,
+      "grad_norm": 0.33997270611237673,
+      "learning_rate": 2.7906223084974403e-05,
+      "loss": 0.5774,
+      "step": 1909
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.358760540859238,
+      "learning_rate": 2.7816505336179798e-05,
+      "loss": 0.6156,
+      "step": 1910
+    },
+    {
+      "epoch": 0.7644,
+      "grad_norm": 0.3324780598022909,
+      "learning_rate": 2.772690873478656e-05,
+      "loss": 0.5653,
+      "step": 1911
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3545338643402483,
+      "learning_rate": 2.7637433431166903e-05,
+      "loss": 0.5645,
+      "step": 1912
+    },
+    {
+      "epoch": 0.7652,
+      "grad_norm": 0.3314663072625293,
+      "learning_rate": 2.754807957548955e-05,
+      "loss": 0.6198,
+      "step": 1913
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.33597655025069056,
+      "learning_rate": 2.7458847317719305e-05,
+      "loss": 0.5753,
+      "step": 1914
+    },
+    {
+      "epoch": 0.766,
+      "grad_norm": 0.5223302936269263,
+      "learning_rate": 2.736973680761702e-05,
+      "loss": 0.5595,
+      "step": 1915
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.35334424448491386,
+      "learning_rate": 2.728074819473908e-05,
+      "loss": 0.5968,
+      "step": 1916
+    },
+    {
+      "epoch": 0.7668,
+      "grad_norm": 0.31679051950561166,
+      "learning_rate": 2.7191881628437333e-05,
+      "loss": 0.5927,
+      "step": 1917
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.35835588969750193,
+      "learning_rate": 2.7103137257858868e-05,
+      "loss": 0.6096,
+      "step": 1918
+    },
+    {
+      "epoch": 0.7676,
+      "grad_norm": 0.39784975047738036,
+      "learning_rate": 2.7014515231945557e-05,
+      "loss": 0.6012,
+      "step": 1919
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3418078968959598,
+      "learning_rate": 2.6926015699434072e-05,
+      "loss": 0.6177,
+      "step": 1920
+    },
+    {
+      "epoch": 0.7684,
+      "grad_norm": 0.3278648428829672,
+      "learning_rate": 2.683763880885538e-05,
+      "loss": 0.5762,
+      "step": 1921
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.35906205351496273,
+      "learning_rate": 2.674938470853472e-05,
+      "loss": 0.5897,
+      "step": 1922
+    },
+    {
+      "epoch": 0.7692,
+      "grad_norm": 0.34729834198526843,
+      "learning_rate": 2.6661253546591157e-05,
+      "loss": 0.5416,
+      "step": 1923
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.35561882001773815,
+      "learning_rate": 2.6573245470937523e-05,
+      "loss": 0.6232,
+      "step": 1924
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.345277417829143,
+      "learning_rate": 2.6485360629279987e-05,
+      "loss": 0.5967,
+      "step": 1925
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.4115697957958416,
+      "learning_rate": 2.639759916911788e-05,
+      "loss": 0.612,
+      "step": 1926
+    },
+    {
+      "epoch": 0.7708,
+      "grad_norm": 0.33966309723785676,
+      "learning_rate": 2.6309961237743585e-05,
+      "loss": 0.618,
+      "step": 1927
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3302196082114761,
+      "learning_rate": 2.6222446982242e-05,
+      "loss": 0.5555,
+      "step": 1928
+    },
+    {
+      "epoch": 0.7716,
+      "grad_norm": 0.34908164626731897,
+      "learning_rate": 2.61350565494906e-05,
+      "loss": 0.5759,
+      "step": 1929
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.3270119125339375,
+      "learning_rate": 2.6047790086158952e-05,
+      "loss": 0.5945,
+      "step": 1930
+    },
+    {
+      "epoch": 0.7724,
+      "grad_norm": 0.32270857553503557,
+      "learning_rate": 2.5960647738708555e-05,
+      "loss": 0.5769,
+      "step": 1931
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4308707041004166,
+      "learning_rate": 2.587362965339265e-05,
+      "loss": 0.6266,
+      "step": 1932
+    },
+    {
+      "epoch": 0.7732,
+      "grad_norm": 0.3334837630486001,
+      "learning_rate": 2.5786735976255973e-05,
+      "loss": 0.5902,
+      "step": 1933
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.352081859909484,
+      "learning_rate": 2.5699966853134337e-05,
+      "loss": 0.5881,
+      "step": 1934
+    },
+    {
+      "epoch": 0.774,
+      "grad_norm": 0.4876058517345553,
+      "learning_rate": 2.5613322429654574e-05,
+      "loss": 0.5469,
+      "step": 1935
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.34472447926214234,
+      "learning_rate": 2.5526802851234268e-05,
+      "loss": 0.6236,
+      "step": 1936
+    },
+    {
+      "epoch": 0.7748,
+      "grad_norm": 0.3429335963910842,
+      "learning_rate": 2.5440408263081382e-05,
+      "loss": 0.6022,
+      "step": 1937
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.3525901489542083,
+      "learning_rate": 2.5354138810194226e-05,
+      "loss": 0.5891,
+      "step": 1938
+    },
+    {
+      "epoch": 0.7756,
+      "grad_norm": 0.3390282795907182,
+      "learning_rate": 2.5267994637360993e-05,
+      "loss": 0.5652,
+      "step": 1939
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3288355057563831,
+      "learning_rate": 2.5181975889159615e-05,
+      "loss": 0.5666,
+      "step": 1940
+    },
+    {
+      "epoch": 0.7764,
+      "grad_norm": 0.3259738427143579,
+      "learning_rate": 2.509608270995758e-05,
+      "loss": 0.5871,
+      "step": 1941
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.3320669275361078,
+      "learning_rate": 2.501031524391163e-05,
+      "loss": 0.5315,
+      "step": 1942
+    },
+    {
+      "epoch": 0.7772,
+      "grad_norm": 0.3604358444439062,
+      "learning_rate": 2.4924673634967466e-05,
+      "loss": 0.6197,
+      "step": 1943
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.34102281009712276,
+      "learning_rate": 2.4839158026859587e-05,
+      "loss": 0.6,
+      "step": 1944
+    },
+    {
+      "epoch": 0.778,
+      "grad_norm": 0.3234797014190219,
+      "learning_rate": 2.475376856311097e-05,
+      "loss": 0.5779,
+      "step": 1945
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.31951513180750857,
+      "learning_rate": 2.4668505387033026e-05,
+      "loss": 0.5645,
+      "step": 1946
+    },
+    {
+      "epoch": 0.7788,
+      "grad_norm": 0.3530404230204508,
+      "learning_rate": 2.4583368641725078e-05,
+      "loss": 0.5602,
+      "step": 1947
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.32186540414463677,
+      "learning_rate": 2.44983584700743e-05,
+      "loss": 0.6049,
+      "step": 1948
+    },
+    {
+      "epoch": 0.7796,
+      "grad_norm": 0.3545125860463158,
+      "learning_rate": 2.4413475014755393e-05,
+      "loss": 0.5902,
+      "step": 1949
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3482011501090206,
+      "learning_rate": 2.432871841823047e-05,
+      "loss": 0.5999,
+      "step": 1950
+    },
+    {
+      "epoch": 0.7804,
+      "grad_norm": 0.36112621390059313,
+      "learning_rate": 2.42440888227487e-05,
+      "loss": 0.568,
+      "step": 1951
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.37731093017360207,
+      "learning_rate": 2.4159586370346088e-05,
+      "loss": 0.6142,
+      "step": 1952
+    },
+    {
+      "epoch": 0.7812,
+      "grad_norm": 0.3747022497765928,
+      "learning_rate": 2.4075211202845227e-05,
+      "loss": 0.5944,
+      "step": 1953
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.3555496374811139,
+      "learning_rate": 2.3990963461855075e-05,
+      "loss": 0.6317,
+      "step": 1954
+    },
+    {
+      "epoch": 0.782,
+      "grad_norm": 0.3407273756582039,
+      "learning_rate": 2.3906843288770886e-05,
+      "loss": 0.5854,
+      "step": 1955
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3440908624146322,
+      "learning_rate": 2.3822850824773625e-05,
+      "loss": 0.5426,
+      "step": 1956
+    },
+    {
+      "epoch": 0.7828,
+      "grad_norm": 0.33841018116361676,
+      "learning_rate": 2.3738986210829993e-05,
+      "loss": 0.55,
+      "step": 1957
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.33270722732089786,
+      "learning_rate": 2.3655249587692073e-05,
+      "loss": 0.5519,
+      "step": 1958
+    },
+    {
+      "epoch": 0.7836,
+      "grad_norm": 0.3618477372722039,
+      "learning_rate": 2.3571641095897223e-05,
+      "loss": 0.5556,
+      "step": 1959
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3352707046874701,
+      "learning_rate": 2.3488160875767717e-05,
+      "loss": 0.5609,
+      "step": 1960
+    },
+    {
+      "epoch": 0.7844,
+      "grad_norm": 0.3827338795032483,
+      "learning_rate": 2.3404809067410525e-05,
+      "loss": 0.6086,
+      "step": 1961
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3785897101425981,
+      "learning_rate": 2.3321585810717117e-05,
+      "loss": 0.6115,
+      "step": 1962
+    },
+    {
+      "epoch": 0.7852,
+      "grad_norm": 0.32939407360776796,
+      "learning_rate": 2.3238491245363147e-05,
+      "loss": 0.6083,
+      "step": 1963
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3465189710411792,
+      "learning_rate": 2.315552551080845e-05,
+      "loss": 0.5899,
+      "step": 1964
+    },
+    {
+      "epoch": 0.786,
+      "grad_norm": 0.3585136683243423,
+      "learning_rate": 2.307268874629649e-05,
+      "loss": 0.6303,
+      "step": 1965
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.35488822785063706,
+      "learning_rate": 2.2989981090854305e-05,
+      "loss": 0.5763,
+      "step": 1966
+    },
+    {
+      "epoch": 0.7868,
+      "grad_norm": 0.36232148088412175,
+      "learning_rate": 2.290740268329227e-05,
+      "loss": 0.607,
+      "step": 1967
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.34905703501076474,
+      "learning_rate": 2.282495366220383e-05,
+      "loss": 0.5998,
+      "step": 1968
+    },
+    {
+      "epoch": 0.7876,
+      "grad_norm": 0.3343963766216138,
+      "learning_rate": 2.2742634165965316e-05,
+      "loss": 0.5806,
+      "step": 1969
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.33554182680707184,
+      "learning_rate": 2.266044433273562e-05,
+      "loss": 0.5434,
+      "step": 1970
+    },
+    {
+      "epoch": 0.7884,
+      "grad_norm": 0.3442527655293528,
+      "learning_rate": 2.2578384300456014e-05,
+      "loss": 0.5984,
+      "step": 1971
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3454136604067078,
+      "learning_rate": 2.249645420684998e-05,
+      "loss": 0.5778,
+      "step": 1972
+    },
+    {
+      "epoch": 0.7892,
+      "grad_norm": 0.3385411124771602,
+      "learning_rate": 2.2414654189422847e-05,
+      "loss": 0.589,
+      "step": 1973
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.33037457311455093,
+      "learning_rate": 2.233298438546172e-05,
+      "loss": 0.6205,
+      "step": 1974
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.34447887093263985,
+      "learning_rate": 2.2251444932035094e-05,
+      "loss": 0.5498,
+      "step": 1975
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.33162742401137485,
+      "learning_rate": 2.2170035965992675e-05,
+      "loss": 0.5917,
+      "step": 1976
+    },
+    {
+      "epoch": 0.7908,
+      "grad_norm": 0.34456838936680356,
+      "learning_rate": 2.2088757623965262e-05,
+      "loss": 0.6382,
+      "step": 1977
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.3423977987340702,
+      "learning_rate": 2.2007610042364336e-05,
+      "loss": 0.5645,
+      "step": 1978
+    },
+    {
+      "epoch": 0.7916,
+      "grad_norm": 0.33175868418516596,
+      "learning_rate": 2.1926593357381996e-05,
+      "loss": 0.5869,
+      "step": 1979
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3259637184455592,
+      "learning_rate": 2.184570770499056e-05,
+      "loss": 0.595,
+      "step": 1980
+    },
+    {
+      "epoch": 0.7924,
+      "grad_norm": 0.33487038070045444,
+      "learning_rate": 2.176495322094254e-05,
+      "loss": 0.5693,
+      "step": 1981
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.40524246973378053,
+      "learning_rate": 2.1684330040770183e-05,
+      "loss": 0.6128,
+      "step": 1982
+    },
+    {
+      "epoch": 0.7932,
+      "grad_norm": 0.32186782971805505,
+      "learning_rate": 2.1603838299785484e-05,
+      "loss": 0.5763,
+      "step": 1983
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.361702443538986,
+      "learning_rate": 2.1523478133079777e-05,
+      "loss": 0.6091,
+      "step": 1984
+    },
+    {
+      "epoch": 0.794,
+      "grad_norm": 0.36141663884275566,
+      "learning_rate": 2.1443249675523536e-05,
+      "loss": 0.6281,
+      "step": 1985
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.36232498191397317,
+      "learning_rate": 2.1363153061766294e-05,
+      "loss": 0.559,
+      "step": 1986
+    },
+    {
+      "epoch": 0.7948,
+      "grad_norm": 0.32724249560950086,
+      "learning_rate": 2.1283188426236178e-05,
+      "loss": 0.5145,
+      "step": 1987
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.34170056116195524,
+      "learning_rate": 2.1203355903139933e-05,
+      "loss": 0.5922,
+      "step": 1988
+    },
+    {
+      "epoch": 0.7956,
+      "grad_norm": 0.3310127243944117,
+      "learning_rate": 2.112365562646248e-05,
+      "loss": 0.5938,
+      "step": 1989
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.37493958928888,
+      "learning_rate": 2.1044087729966856e-05,
+      "loss": 0.5968,
+      "step": 1990
+    },
+    {
+      "epoch": 0.7964,
+      "grad_norm": 0.33613546556127055,
+      "learning_rate": 2.096465234719389e-05,
+      "loss": 0.5768,
+      "step": 1991
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3325727050763233,
+      "learning_rate": 2.0885349611461967e-05,
+      "loss": 0.5461,
+      "step": 1992
+    },
+    {
+      "epoch": 0.7972,
+      "grad_norm": 0.33725501876277636,
+      "learning_rate": 2.0806179655866966e-05,
+      "loss": 0.5623,
+      "step": 1993
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.3633590484486614,
+      "learning_rate": 2.072714261328177e-05,
+      "loss": 0.5884,
+      "step": 1994
+    },
+    {
+      "epoch": 0.798,
+      "grad_norm": 0.3764466040216392,
+      "learning_rate": 2.0648238616356332e-05,
+      "loss": 0.5921,
+      "step": 1995
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3145057249043822,
+      "learning_rate": 2.0569467797517173e-05,
+      "loss": 0.5603,
+      "step": 1996
+    },
+    {
+      "epoch": 0.7988,
+      "grad_norm": 0.324959610351305,
+      "learning_rate": 2.0490830288967444e-05,
+      "loss": 0.5872,
+      "step": 1997
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.340462206888166,
+      "learning_rate": 2.0412326222686418e-05,
+      "loss": 0.5936,
+      "step": 1998
+    },
+    {
+      "epoch": 0.7996,
+      "grad_norm": 0.34466338871786273,
+      "learning_rate": 2.033395573042952e-05,
+      "loss": 0.6344,
+      "step": 1999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3300321247103113,
+      "learning_rate": 2.025571894372794e-05,
+      "loss": 0.5504,
+      "step": 2000
+    },
+    {
+      "epoch": 0.8004,
+      "grad_norm": 0.33346644760322913,
+      "learning_rate": 2.0177615993888422e-05,
+      "loss": 0.5694,
+      "step": 2001
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.3438563421476556,
+      "learning_rate": 2.0099647011993216e-05,
+      "loss": 0.5771,
+      "step": 2002
+    },
+    {
+      "epoch": 0.8012,
+      "grad_norm": 0.3429622590645044,
+      "learning_rate": 2.00218121288996e-05,
+      "loss": 0.5867,
+      "step": 2003
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3375130326381454,
+      "learning_rate": 1.9944111475239867e-05,
+      "loss": 0.5606,
+      "step": 2004
+    },
+    {
+      "epoch": 0.802,
+      "grad_norm": 0.3532062037107467,
+      "learning_rate": 1.9866545181421013e-05,
+      "loss": 0.639,
+      "step": 2005
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3377893264435425,
+      "learning_rate": 1.9789113377624502e-05,
+      "loss": 0.5882,
+      "step": 2006
+    },
+    {
+      "epoch": 0.8028,
+      "grad_norm": 0.35031490365634965,
+      "learning_rate": 1.971181619380611e-05,
+      "loss": 0.5503,
+      "step": 2007
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.34602423861902754,
+      "learning_rate": 1.963465375969572e-05,
+      "loss": 0.5966,
+      "step": 2008
+    },
+    {
+      "epoch": 0.8036,
+      "grad_norm": 0.36039315536662786,
+      "learning_rate": 1.9557626204796986e-05,
+      "loss": 0.6479,
+      "step": 2009
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.3340153632783169,
+      "learning_rate": 1.9480733658387175e-05,
+      "loss": 0.5855,
+      "step": 2010
+    },
+    {
+      "epoch": 0.8044,
+      "grad_norm": 0.32314770029857476,
+      "learning_rate": 1.9403976249517085e-05,
+      "loss": 0.5736,
+      "step": 2011
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.34818143552495107,
+      "learning_rate": 1.9327354107010566e-05,
+      "loss": 0.5693,
+      "step": 2012
+    },
+    {
+      "epoch": 0.8052,
+      "grad_norm": 0.338316754813434,
+      "learning_rate": 1.9250867359464576e-05,
+      "loss": 0.5862,
+      "step": 2013
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.3454876898375605,
+      "learning_rate": 1.9174516135248744e-05,
+      "loss": 0.5996,
+      "step": 2014
+    },
+    {
+      "epoch": 0.806,
+      "grad_norm": 0.3445315276335558,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.6002,
+      "step": 2015
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3496144771363773,
+      "learning_rate": 1.902222076914869e-05,
+      "loss": 0.6209,
+      "step": 2016
+    },
+    {
+      "epoch": 0.8068,
+      "grad_norm": 0.3398542244163301,
+      "learning_rate": 1.894627688286571e-05,
+      "loss": 0.5329,
+      "step": 2017
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.3534623363527577,
+      "learning_rate": 1.8870469031114868e-05,
+      "loss": 0.5855,
+      "step": 2018
+    },
+    {
+      "epoch": 0.8076,
+      "grad_norm": 0.34738356781161317,
+      "learning_rate": 1.8794797341126402e-05,
+      "loss": 0.6095,
+      "step": 2019
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3455584906849207,
+      "learning_rate": 1.871926193990202e-05,
+      "loss": 0.5946,
+      "step": 2020
+    },
+    {
+      "epoch": 0.8084,
+      "grad_norm": 0.3326389491789541,
+      "learning_rate": 1.8643862954214754e-05,
+      "loss": 0.559,
+      "step": 2021
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.34172098276548246,
+      "learning_rate": 1.856860051060866e-05,
+      "loss": 0.6053,
+      "step": 2022
+    },
+    {
+      "epoch": 0.8092,
+      "grad_norm": 0.3434873966275109,
+      "learning_rate": 1.8493474735398576e-05,
+      "loss": 0.5983,
+      "step": 2023
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3564931568143112,
+      "learning_rate": 1.841848575467001e-05,
+      "loss": 0.6121,
+      "step": 2024
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.32283698622671636,
+      "learning_rate": 1.8343633694278895e-05,
+      "loss": 0.5339,
+      "step": 2025
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.33907509599918256,
+      "learning_rate": 1.8268918679851388e-05,
+      "loss": 0.6126,
+      "step": 2026
+    },
+    {
+      "epoch": 0.8108,
+      "grad_norm": 0.3456536891232961,
+      "learning_rate": 1.8194340836783563e-05,
+      "loss": 0.6003,
+      "step": 2027
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5645567074909379,
+      "learning_rate": 1.811990029024133e-05,
+      "loss": 0.6047,
+      "step": 2028
+    },
+    {
+      "epoch": 0.8116,
+      "grad_norm": 0.33826711823216155,
+      "learning_rate": 1.8045597165160133e-05,
+      "loss": 0.5925,
+      "step": 2029
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.3305265188201426,
+      "learning_rate": 1.7971431586244815e-05,
+      "loss": 0.5565,
+      "step": 2030
+    },
+    {
+      "epoch": 0.8124,
+      "grad_norm": 0.34634656094588795,
+      "learning_rate": 1.7897403677969403e-05,
+      "loss": 0.6143,
+      "step": 2031
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3570019545422078,
+      "learning_rate": 1.782351356457679e-05,
+      "loss": 0.5901,
+      "step": 2032
+    },
+    {
+      "epoch": 0.8132,
+      "grad_norm": 0.3377890457099209,
+      "learning_rate": 1.774976137007861e-05,
+      "loss": 0.5996,
+      "step": 2033
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.3563023468880287,
+      "learning_rate": 1.767614721825509e-05,
+      "loss": 0.5776,
+      "step": 2034
+    },
+    {
+      "epoch": 0.814,
+      "grad_norm": 0.33953225292164674,
+      "learning_rate": 1.7602671232654754e-05,
+      "loss": 0.5339,
+      "step": 2035
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.36826665696608535,
+      "learning_rate": 1.7529333536594215e-05,
+      "loss": 0.5819,
+      "step": 2036
+    },
+    {
+      "epoch": 0.8148,
+      "grad_norm": 0.3323992173322127,
+      "learning_rate": 1.7456134253157975e-05,
+      "loss": 0.5531,
+      "step": 2037
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.3344180779819317,
+      "learning_rate": 1.7383073505198255e-05,
+      "loss": 0.6052,
+      "step": 2038
+    },
+    {
+      "epoch": 0.8156,
+      "grad_norm": 0.34888494102531564,
+      "learning_rate": 1.7310151415334798e-05,
+      "loss": 0.5617,
+      "step": 2039
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3502559122482306,
+      "learning_rate": 1.723736810595461e-05,
+      "loss": 0.5941,
+      "step": 2040
+    },
+    {
+      "epoch": 0.8164,
+      "grad_norm": 0.7209477016353654,
+      "learning_rate": 1.716472369921178e-05,
+      "loss": 0.5645,
+      "step": 2041
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.33700584654711524,
+      "learning_rate": 1.7092218317027232e-05,
+      "loss": 0.5765,
+      "step": 2042
+    },
+    {
+      "epoch": 0.8172,
+      "grad_norm": 0.3598343415422159,
+      "learning_rate": 1.7019852081088617e-05,
+      "loss": 0.6089,
+      "step": 2043
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.33946299005586367,
+      "learning_rate": 1.6947625112850073e-05,
+      "loss": 0.587,
+      "step": 2044
+    },
+    {
+      "epoch": 0.818,
+      "grad_norm": 0.5583069211823942,
+      "learning_rate": 1.6875537533531948e-05,
+      "loss": 0.5511,
+      "step": 2045
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.366562389586145,
+      "learning_rate": 1.680358946412064e-05,
+      "loss": 0.5958,
+      "step": 2046
+    },
+    {
+      "epoch": 0.8188,
+      "grad_norm": 0.33455470150572314,
+      "learning_rate": 1.673178102536842e-05,
+      "loss": 0.612,
+      "step": 2047
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3686209541133758,
+      "learning_rate": 1.6660112337793256e-05,
+      "loss": 0.5552,
+      "step": 2048
+    },
+    {
+      "epoch": 0.8196,
+      "grad_norm": 0.35436738074759055,
+      "learning_rate": 1.6588583521678535e-05,
+      "loss": 0.5935,
+      "step": 2049
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.3447122268173417,
+      "learning_rate": 1.65171946970729e-05,
+      "loss": 0.5845,
+      "step": 2050
+    },
+    {
+      "epoch": 0.8204,
+      "grad_norm": 0.36429045978656727,
+      "learning_rate": 1.644594598378999e-05,
+      "loss": 0.6181,
+      "step": 2051
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.38445558327193396,
+      "learning_rate": 1.6374837501408403e-05,
+      "loss": 0.5828,
+      "step": 2052
+    },
+    {
+      "epoch": 0.8212,
+      "grad_norm": 0.3926817490647689,
+      "learning_rate": 1.6303869369271264e-05,
+      "loss": 0.5558,
+      "step": 2053
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.36464265465604595,
+      "learning_rate": 1.623304170648625e-05,
+      "loss": 0.5766,
+      "step": 2054
+    },
+    {
+      "epoch": 0.822,
+      "grad_norm": 0.3433207753429021,
+      "learning_rate": 1.6162354631925204e-05,
+      "loss": 0.5556,
+      "step": 2055
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3385924963633419,
+      "learning_rate": 1.609180826422404e-05,
+      "loss": 0.5877,
+      "step": 2056
+    },
+    {
+      "epoch": 0.8228,
+      "grad_norm": 0.38593124490029423,
+      "learning_rate": 1.6021402721782532e-05,
+      "loss": 0.5923,
+      "step": 2057
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.35482846986110306,
+      "learning_rate": 1.5951138122764132e-05,
+      "loss": 0.5648,
+      "step": 2058
+    },
+    {
+      "epoch": 0.8236,
+      "grad_norm": 0.3445861470215919,
+      "learning_rate": 1.58810145850957e-05,
+      "loss": 0.5827,
+      "step": 2059
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3472389735635199,
+      "learning_rate": 1.5811032226467305e-05,
+      "loss": 0.5944,
+      "step": 2060
+    },
+    {
+      "epoch": 0.8244,
+      "grad_norm": 0.3188110832300483,
+      "learning_rate": 1.574119116433219e-05,
+      "loss": 0.6018,
+      "step": 2061
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.3293400635332867,
+      "learning_rate": 1.5671491515906355e-05,
+      "loss": 0.5512,
+      "step": 2062
+    },
+    {
+      "epoch": 0.8252,
+      "grad_norm": 0.4845587784638902,
+      "learning_rate": 1.5601933398168522e-05,
+      "loss": 0.5875,
+      "step": 2063
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3808356167565948,
+      "learning_rate": 1.553251692785985e-05,
+      "loss": 0.5855,
+      "step": 2064
+    },
+    {
+      "epoch": 0.826,
+      "grad_norm": 0.3582145981743341,
+      "learning_rate": 1.5463242221483743e-05,
+      "loss": 0.608,
+      "step": 2065
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.341944239074536,
+      "learning_rate": 1.5394109395305757e-05,
+      "loss": 0.5831,
+      "step": 2066
+    },
+    {
+      "epoch": 0.8268,
+      "grad_norm": 0.35544254885132803,
+      "learning_rate": 1.5325118565353234e-05,
+      "loss": 0.6136,
+      "step": 2067
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.31741929242365463,
+      "learning_rate": 1.5256269847415283e-05,
+      "loss": 0.529,
+      "step": 2068
+    },
+    {
+      "epoch": 0.8276,
+      "grad_norm": 0.3577898381015023,
+      "learning_rate": 1.5187563357042423e-05,
+      "loss": 0.6031,
+      "step": 2069
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.4909330665553209,
+      "learning_rate": 1.5118999209546559e-05,
+      "loss": 0.5942,
+      "step": 2070
+    },
+    {
+      "epoch": 0.8284,
+      "grad_norm": 0.3459967010536719,
+      "learning_rate": 1.5050577520000607e-05,
+      "loss": 0.5861,
+      "step": 2071
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3729678607753999,
+      "learning_rate": 1.4982298403238471e-05,
+      "loss": 0.594,
+      "step": 2072
+    },
+    {
+      "epoch": 0.8292,
+      "grad_norm": 0.4090867702653392,
+      "learning_rate": 1.4914161973854712e-05,
+      "loss": 0.6108,
+      "step": 2073
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.3379008513631914,
+      "learning_rate": 1.4846168346204425e-05,
+      "loss": 0.5701,
+      "step": 2074
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.3479251378314484,
+      "learning_rate": 1.4778317634403083e-05,
+      "loss": 0.5894,
+      "step": 2075
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.34575192501995333,
+      "learning_rate": 1.4710609952326238e-05,
+      "loss": 0.5834,
+      "step": 2076
+    },
+    {
+      "epoch": 0.8308,
+      "grad_norm": 0.35259532439471253,
+      "learning_rate": 1.4643045413609458e-05,
+      "loss": 0.5619,
+      "step": 2077
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.36088566864808475,
+      "learning_rate": 1.457562413164799e-05,
+      "loss": 0.6035,
+      "step": 2078
+    },
+    {
+      "epoch": 0.8316,
+      "grad_norm": 0.3619762257003658,
+      "learning_rate": 1.4508346219596724e-05,
+      "loss": 0.6257,
+      "step": 2079
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.33927954550870054,
+      "learning_rate": 1.444121179036989e-05,
+      "loss": 0.5573,
+      "step": 2080
+    },
+    {
+      "epoch": 0.8324,
+      "grad_norm": 0.6645996870320319,
+      "learning_rate": 1.4374220956640893e-05,
+      "loss": 0.6153,
+      "step": 2081
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.3429083338924902,
+      "learning_rate": 1.4307373830842174e-05,
+      "loss": 0.5926,
+      "step": 2082
+    },
+    {
+      "epoch": 0.8332,
+      "grad_norm": 0.3487607768016974,
+      "learning_rate": 1.424067052516499e-05,
+      "loss": 0.5658,
+      "step": 2083
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3360185981609451,
+      "learning_rate": 1.4174111151559189e-05,
+      "loss": 0.571,
+      "step": 2084
+    },
+    {
+      "epoch": 0.834,
+      "grad_norm": 0.3808572645249857,
+      "learning_rate": 1.4107695821733025e-05,
+      "loss": 0.5747,
+      "step": 2085
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.3592965446953947,
+      "learning_rate": 1.4041424647153112e-05,
+      "loss": 0.6209,
+      "step": 2086
+    },
+    {
+      "epoch": 0.8348,
+      "grad_norm": 0.336451723889464,
+      "learning_rate": 1.3975297739043992e-05,
+      "loss": 0.5813,
+      "step": 2087
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3452593481096945,
+      "learning_rate": 1.3909315208388184e-05,
+      "loss": 0.6158,
+      "step": 2088
+    },
+    {
+      "epoch": 0.8356,
+      "grad_norm": 0.35079055441792556,
+      "learning_rate": 1.3843477165925844e-05,
+      "loss": 0.5859,
+      "step": 2089
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.3252303011864,
+      "learning_rate": 1.3777783722154603e-05,
+      "loss": 0.5465,
+      "step": 2090
+    },
+    {
+      "epoch": 0.8364,
+      "grad_norm": 0.3316008207825591,
+      "learning_rate": 1.3712234987329486e-05,
+      "loss": 0.5683,
+      "step": 2091
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3426650526067097,
+      "learning_rate": 1.3646831071462607e-05,
+      "loss": 0.5636,
+      "step": 2092
+    },
+    {
+      "epoch": 0.8372,
+      "grad_norm": 0.34348157223588005,
+      "learning_rate": 1.3581572084323013e-05,
+      "loss": 0.5896,
+      "step": 2093
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.33426418782158246,
+      "learning_rate": 1.3516458135436538e-05,
+      "loss": 0.5741,
+      "step": 2094
+    },
+    {
+      "epoch": 0.838,
+      "grad_norm": 0.5450857217741052,
+      "learning_rate": 1.3451489334085554e-05,
+      "loss": 0.6091,
+      "step": 2095
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.37112334679589354,
+      "learning_rate": 1.3386665789308883e-05,
+      "loss": 0.5957,
+      "step": 2096
+    },
+    {
+      "epoch": 0.8388,
+      "grad_norm": 0.3650657952317654,
+      "learning_rate": 1.3321987609901554e-05,
+      "loss": 0.5804,
+      "step": 2097
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.3436873179991808,
+      "learning_rate": 1.325745490441458e-05,
+      "loss": 0.5974,
+      "step": 2098
+    },
+    {
+      "epoch": 0.8396,
+      "grad_norm": 0.3376329269121168,
+      "learning_rate": 1.3193067781154833e-05,
+      "loss": 0.5341,
+      "step": 2099
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.35743497857189027,
+      "learning_rate": 1.3128826348184887e-05,
+      "loss": 0.5931,
+      "step": 2100
+    },
+    {
+      "epoch": 0.8404,
+      "grad_norm": 0.35691555443942813,
+      "learning_rate": 1.3064730713322792e-05,
+      "loss": 0.564,
+      "step": 2101
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.3488535101750259,
+      "learning_rate": 1.300078098414188e-05,
+      "loss": 0.6032,
+      "step": 2102
+    },
+    {
+      "epoch": 0.8412,
+      "grad_norm": 0.3172197040457403,
+      "learning_rate": 1.2936977267970596e-05,
+      "loss": 0.5563,
+      "step": 2103
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3306876381261289,
+      "learning_rate": 1.2873319671892337e-05,
+      "loss": 0.5758,
+      "step": 2104
+    },
+    {
+      "epoch": 0.842,
+      "grad_norm": 0.3369166000365717,
+      "learning_rate": 1.2809808302745297e-05,
+      "loss": 0.5406,
+      "step": 2105
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.35732401228280025,
+      "learning_rate": 1.2746443267122233e-05,
+      "loss": 0.572,
+      "step": 2106
+    },
+    {
+      "epoch": 0.8428,
+      "grad_norm": 0.3599015927677342,
+      "learning_rate": 1.2683224671370286e-05,
+      "loss": 0.5777,
+      "step": 2107
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.35448239242111973,
+      "learning_rate": 1.2620152621590819e-05,
+      "loss": 0.5589,
+      "step": 2108
+    },
+    {
+      "epoch": 0.8436,
+      "grad_norm": 0.3365683844191083,
+      "learning_rate": 1.255722722363929e-05,
+      "loss": 0.5256,
+      "step": 2109
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.36195666986890024,
+      "learning_rate": 1.2494448583125018e-05,
+      "loss": 0.604,
+      "step": 2110
+    },
+    {
+      "epoch": 0.8444,
+      "grad_norm": 0.34492524171948424,
+      "learning_rate": 1.2431816805410967e-05,
+      "loss": 0.5719,
+      "step": 2111
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.33899530490449875,
+      "learning_rate": 1.2369331995613665e-05,
+      "loss": 0.5785,
+      "step": 2112
+    },
+    {
+      "epoch": 0.8452,
+      "grad_norm": 0.3483370447155762,
+      "learning_rate": 1.2306994258602922e-05,
+      "loss": 0.5758,
+      "step": 2113
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.37141725085210964,
+      "learning_rate": 1.2244803699001783e-05,
+      "loss": 0.6501,
+      "step": 2114
+    },
+    {
+      "epoch": 0.846,
+      "grad_norm": 0.32631894828415153,
+      "learning_rate": 1.218276042118629e-05,
+      "loss": 0.6012,
+      "step": 2115
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3443753329616641,
+      "learning_rate": 1.2120864529285203e-05,
+      "loss": 0.5654,
+      "step": 2116
+    },
+    {
+      "epoch": 0.8468,
+      "grad_norm": 0.32433973224085527,
+      "learning_rate": 1.2059116127179993e-05,
+      "loss": 0.5942,
+      "step": 2117
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.34710544152147893,
+      "learning_rate": 1.199751531850457e-05,
+      "loss": 0.5826,
+      "step": 2118
+    },
+    {
+      "epoch": 0.8476,
+      "grad_norm": 0.38471513154291287,
+      "learning_rate": 1.1936062206645182e-05,
+      "loss": 0.551,
+      "step": 2119
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.33527920319158955,
+      "learning_rate": 1.1874756894740135e-05,
+      "loss": 0.5181,
+      "step": 2120
+    },
+    {
+      "epoch": 0.8484,
+      "grad_norm": 0.3398984166096767,
+      "learning_rate": 1.1813599485679683e-05,
+      "loss": 0.5839,
+      "step": 2121
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.3373449070753534,
+      "learning_rate": 1.1752590082105864e-05,
+      "loss": 0.5607,
+      "step": 2122
+    },
+    {
+      "epoch": 0.8492,
+      "grad_norm": 0.34089897512678086,
+      "learning_rate": 1.1691728786412316e-05,
+      "loss": 0.5652,
+      "step": 2123
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.35003168205330665,
+      "learning_rate": 1.1631015700744152e-05,
+      "loss": 0.5889,
+      "step": 2124
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.3568026042940518,
+      "learning_rate": 1.1570450926997655e-05,
+      "loss": 0.6093,
+      "step": 2125
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.3827902928256842,
+      "learning_rate": 1.1510034566820204e-05,
+      "loss": 0.5776,
+      "step": 2126
+    },
+    {
+      "epoch": 0.8508,
+      "grad_norm": 0.3642162679937661,
+      "learning_rate": 1.1449766721610189e-05,
+      "loss": 0.6023,
+      "step": 2127
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.36057336322997025,
+      "learning_rate": 1.1389647492516598e-05,
+      "loss": 0.5186,
+      "step": 2128
+    },
+    {
+      "epoch": 0.8516,
+      "grad_norm": 0.3636241408296972,
+      "learning_rate": 1.132967698043913e-05,
+      "loss": 0.6217,
+      "step": 2129
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3317373651782228,
+      "learning_rate": 1.1269855286027797e-05,
+      "loss": 0.5544,
+      "step": 2130
+    },
+    {
+      "epoch": 0.8524,
+      "grad_norm": 0.33341520477991027,
+      "learning_rate": 1.1210182509682854e-05,
+      "loss": 0.6192,
+      "step": 2131
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3412776256292698,
+      "learning_rate": 1.1150658751554665e-05,
+      "loss": 0.5709,
+      "step": 2132
+    },
+    {
+      "epoch": 0.8532,
+      "grad_norm": 0.3357547206020433,
+      "learning_rate": 1.1091284111543498e-05,
+      "loss": 0.5858,
+      "step": 2133
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.36459156646194935,
+      "learning_rate": 1.1032058689299296e-05,
+      "loss": 0.6304,
+      "step": 2134
+    },
+    {
+      "epoch": 0.854,
+      "grad_norm": 0.3287866467689789,
+      "learning_rate": 1.0972982584221592e-05,
+      "loss": 0.5963,
+      "step": 2135
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.33925962870122844,
+      "learning_rate": 1.0914055895459352e-05,
+      "loss": 0.5686,
+      "step": 2136
+    },
+    {
+      "epoch": 0.8548,
+      "grad_norm": 0.387450059725076,
+      "learning_rate": 1.08552787219107e-05,
+      "loss": 0.5654,
+      "step": 2137
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.3437873979196186,
+      "learning_rate": 1.0796651162222915e-05,
+      "loss": 0.5904,
+      "step": 2138
+    },
+    {
+      "epoch": 0.8556,
+      "grad_norm": 0.35215658105124237,
+      "learning_rate": 1.07381733147921e-05,
+      "loss": 0.5938,
+      "step": 2139
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3411921172604908,
+      "learning_rate": 1.067984527776309e-05,
+      "loss": 0.5878,
+      "step": 2140
+    },
+    {
+      "epoch": 0.8564,
+      "grad_norm": 0.33376281233262733,
+      "learning_rate": 1.0621667149029379e-05,
+      "loss": 0.619,
+      "step": 2141
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.33852640238096193,
+      "learning_rate": 1.056363902623274e-05,
+      "loss": 0.5891,
+      "step": 2142
+    },
+    {
+      "epoch": 0.8572,
+      "grad_norm": 0.34858034692372686,
+      "learning_rate": 1.0505761006763314e-05,
+      "loss": 0.594,
+      "step": 2143
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.34536636726515607,
+      "learning_rate": 1.0448033187759221e-05,
+      "loss": 0.6005,
+      "step": 2144
+    },
+    {
+      "epoch": 0.858,
+      "grad_norm": 0.35874099678350707,
+      "learning_rate": 1.0390455666106547e-05,
+      "loss": 0.5819,
+      "step": 2145
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.33635170634072625,
+      "learning_rate": 1.0333028538439094e-05,
+      "loss": 0.5632,
+      "step": 2146
+    },
+    {
+      "epoch": 0.8588,
+      "grad_norm": 0.33136983679728427,
+      "learning_rate": 1.027575190113832e-05,
+      "loss": 0.5834,
+      "step": 2147
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.35258195045669577,
+      "learning_rate": 1.0218625850333041e-05,
+      "loss": 0.5615,
+      "step": 2148
+    },
+    {
+      "epoch": 0.8596,
+      "grad_norm": 0.3430590005549731,
+      "learning_rate": 1.0161650481899342e-05,
+      "loss": 0.5392,
+      "step": 2149
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.347786574887435,
+      "learning_rate": 1.010482589146048e-05,
+      "loss": 0.617,
+      "step": 2150
+    },
+    {
+      "epoch": 0.8604,
+      "grad_norm": 0.32974970272121834,
+      "learning_rate": 1.0048152174386583e-05,
+      "loss": 0.5241,
+      "step": 2151
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3327895781603911,
+      "learning_rate": 9.991629425794623e-06,
+      "loss": 0.5575,
+      "step": 2152
+    },
+    {
+      "epoch": 0.8612,
+      "grad_norm": 0.3466328037932508,
+      "learning_rate": 9.935257740548143e-06,
+      "loss": 0.6101,
+      "step": 2153
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.39210566626994137,
+      "learning_rate": 9.879037213257213e-06,
+      "loss": 0.5923,
+      "step": 2154
+    },
+    {
+      "epoch": 0.862,
+      "grad_norm": 0.3250064069208217,
+      "learning_rate": 9.822967938278171e-06,
+      "loss": 0.526,
+      "step": 2155
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.33374959570321466,
+      "learning_rate": 9.767050009713474e-06,
+      "loss": 0.5672,
+      "step": 2156
+    },
+    {
+      "epoch": 0.8628,
+      "grad_norm": 0.33645634186278234,
+      "learning_rate": 9.711283521411674e-06,
+      "loss": 0.5337,
+      "step": 2157
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.40772098857340405,
+      "learning_rate": 9.655668566967025e-06,
+      "loss": 0.5643,
+      "step": 2158
+    },
+    {
+      "epoch": 0.8636,
+      "grad_norm": 0.34107492104769427,
+      "learning_rate": 9.600205239719584e-06,
+      "loss": 0.61,
+      "step": 2159
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.38363834491880583,
+      "learning_rate": 9.544893632754814e-06,
+      "loss": 0.5668,
+      "step": 2160
+    },
+    {
+      "epoch": 0.8644,
+      "grad_norm": 0.3357640280010078,
+      "learning_rate": 9.489733838903647e-06,
+      "loss": 0.5896,
+      "step": 2161
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.346235620539173,
+      "learning_rate": 9.434725950742118e-06,
+      "loss": 0.616,
+      "step": 2162
+    },
+    {
+      "epoch": 0.8652,
+      "grad_norm": 0.33011483628748084,
+      "learning_rate": 9.379870060591434e-06,
+      "loss": 0.5937,
+      "step": 2163
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.34631395790079156,
+      "learning_rate": 9.325166260517592e-06,
+      "loss": 0.5647,
+      "step": 2164
+    },
+    {
+      "epoch": 0.866,
+      "grad_norm": 0.32898803954272465,
+      "learning_rate": 9.270614642331376e-06,
+      "loss": 0.5698,
+      "step": 2165
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.3464200472980868,
+      "learning_rate": 9.216215297588183e-06,
+      "loss": 0.576,
+      "step": 2166
+    },
+    {
+      "epoch": 0.8668,
+      "grad_norm": 0.343889344991586,
+      "learning_rate": 9.161968317587787e-06,
+      "loss": 0.6337,
+      "step": 2167
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.33164638899696103,
+      "learning_rate": 9.107873793374322e-06,
+      "loss": 0.6191,
+      "step": 2168
+    },
+    {
+      "epoch": 0.8676,
+      "grad_norm": 0.3574478499833753,
+      "learning_rate": 9.053931815735994e-06,
+      "loss": 0.6558,
+      "step": 2169
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.36453568461488134,
+      "learning_rate": 9.000142475204964e-06,
+      "loss": 0.5521,
+      "step": 2170
+    },
+    {
+      "epoch": 0.8684,
+      "grad_norm": 0.38408131723261274,
+      "learning_rate": 8.946505862057286e-06,
+      "loss": 0.6281,
+      "step": 2171
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3601526494169314,
+      "learning_rate": 8.893022066312672e-06,
+      "loss": 0.5922,
+      "step": 2172
+    },
+    {
+      "epoch": 0.8692,
+      "grad_norm": 0.34482270087578476,
+      "learning_rate": 8.839691177734322e-06,
+      "loss": 0.6064,
+      "step": 2173
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.32648666829842754,
+      "learning_rate": 8.786513285828834e-06,
+      "loss": 0.5437,
+      "step": 2174
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.3936583852917972,
+      "learning_rate": 8.733488479845997e-06,
+      "loss": 0.6111,
+      "step": 2175
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3175571242137894,
+      "learning_rate": 8.680616848778711e-06,
+      "loss": 0.5332,
+      "step": 2176
+    },
+    {
+      "epoch": 0.8708,
+      "grad_norm": 0.3686518734142378,
+      "learning_rate": 8.627898481362817e-06,
+      "loss": 0.6331,
+      "step": 2177
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.3539160746294602,
+      "learning_rate": 8.575333466076863e-06,
+      "loss": 0.6253,
+      "step": 2178
+    },
+    {
+      "epoch": 0.8716,
+      "grad_norm": 0.32778557025960625,
+      "learning_rate": 8.522921891142032e-06,
+      "loss": 0.553,
+      "step": 2179
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3485023348541899,
+      "learning_rate": 8.470663844522052e-06,
+      "loss": 0.5814,
+      "step": 2180
+    },
+    {
+      "epoch": 0.8724,
+      "grad_norm": 0.3310926997935631,
+      "learning_rate": 8.418559413922933e-06,
+      "loss": 0.5582,
+      "step": 2181
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.37437652214535044,
+      "learning_rate": 8.366608686792854e-06,
+      "loss": 0.5453,
+      "step": 2182
+    },
+    {
+      "epoch": 0.8732,
+      "grad_norm": 0.3481714449337977,
+      "learning_rate": 8.31481175032206e-06,
+      "loss": 0.5731,
+      "step": 2183
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3446384338979933,
+      "learning_rate": 8.263168691442624e-06,
+      "loss": 0.6235,
+      "step": 2184
+    },
+    {
+      "epoch": 0.874,
+      "grad_norm": 0.35092100052470027,
+      "learning_rate": 8.21167959682848e-06,
+      "loss": 0.5939,
+      "step": 2185
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.33976686257110345,
+      "learning_rate": 8.16034455289506e-06,
+      "loss": 0.5432,
+      "step": 2186
+    },
+    {
+      "epoch": 0.8748,
+      "grad_norm": 0.3424977560024411,
+      "learning_rate": 8.109163645799267e-06,
+      "loss": 0.5625,
+      "step": 2187
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3259931498126824,
+      "learning_rate": 8.058136961439333e-06,
+      "loss": 0.5553,
+      "step": 2188
+    },
+    {
+      "epoch": 0.8756,
+      "grad_norm": 0.329605914429993,
+      "learning_rate": 8.007264585454633e-06,
+      "loss": 0.5597,
+      "step": 2189
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.3570522607877131,
+      "learning_rate": 7.956546603225601e-06,
+      "loss": 0.5992,
+      "step": 2190
+    },
+    {
+      "epoch": 0.8764,
+      "grad_norm": 0.37566719111983116,
+      "learning_rate": 7.905983099873504e-06,
+      "loss": 0.5553,
+      "step": 2191
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3854131661562234,
+      "learning_rate": 7.85557416026037e-06,
+      "loss": 0.546,
+      "step": 2192
+    },
+    {
+      "epoch": 0.8772,
+      "grad_norm": 0.3247006170241847,
+      "learning_rate": 7.805319868988758e-06,
+      "loss": 0.5784,
+      "step": 2193
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.36393035734443796,
+      "learning_rate": 7.755220310401811e-06,
+      "loss": 0.6415,
+      "step": 2194
+    },
+    {
+      "epoch": 0.878,
+      "grad_norm": 0.3430330211948053,
+      "learning_rate": 7.705275568582848e-06,
+      "loss": 0.5825,
+      "step": 2195
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3364752895914115,
+      "learning_rate": 7.655485727355415e-06,
+      "loss": 0.6064,
+      "step": 2196
+    },
+    {
+      "epoch": 0.8788,
+      "grad_norm": 0.32887588473018375,
+      "learning_rate": 7.605850870283049e-06,
+      "loss": 0.6015,
+      "step": 2197
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.37339551678069766,
+      "learning_rate": 7.556371080669222e-06,
+      "loss": 0.5891,
+      "step": 2198
+    },
+    {
+      "epoch": 0.8796,
+      "grad_norm": 0.3412578827035179,
+      "learning_rate": 7.5070464415571415e-06,
+      "loss": 0.6025,
+      "step": 2199
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3176814606146469,
+      "learning_rate": 7.457877035729588e-06,
+      "loss": 0.546,
+      "step": 2200
+    },
+    {
+      "epoch": 0.8804,
+      "grad_norm": 0.35333484894842804,
+      "learning_rate": 7.408862945708839e-06,
+      "loss": 0.5666,
+      "step": 2201
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.34067618744853434,
+      "learning_rate": 7.360004253756459e-06,
+      "loss": 0.5558,
+      "step": 2202
+    },
+    {
+      "epoch": 0.8812,
+      "grad_norm": 0.3493373501733574,
+      "learning_rate": 7.311301041873275e-06,
+      "loss": 0.5826,
+      "step": 2203
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3470043637665803,
+      "learning_rate": 7.262753391799127e-06,
+      "loss": 0.5155,
+      "step": 2204
+    },
+    {
+      "epoch": 0.882,
+      "grad_norm": 0.3557963350107681,
+      "learning_rate": 7.21436138501278e-06,
+      "loss": 0.5693,
+      "step": 2205
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.4127331932220236,
+      "learning_rate": 7.166125102731735e-06,
+      "loss": 0.6331,
+      "step": 2206
+    },
+    {
+      "epoch": 0.8828,
+      "grad_norm": 0.34864615914110403,
+      "learning_rate": 7.118044625912213e-06,
+      "loss": 0.5679,
+      "step": 2207
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3180934168730491,
+      "learning_rate": 7.070120035248906e-06,
+      "loss": 0.5818,
+      "step": 2208
+    },
+    {
+      "epoch": 0.8836,
+      "grad_norm": 0.3678221847733007,
+      "learning_rate": 7.022351411174866e-06,
+      "loss": 0.6132,
+      "step": 2209
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.3802873506675512,
+      "learning_rate": 6.974738833861383e-06,
+      "loss": 0.5353,
+      "step": 2210
+    },
+    {
+      "epoch": 0.8844,
+      "grad_norm": 0.3893805654449064,
+      "learning_rate": 6.927282383217892e-06,
+      "loss": 0.566,
+      "step": 2211
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.33747086502280854,
+      "learning_rate": 6.879982138891716e-06,
+      "loss": 0.5934,
+      "step": 2212
+    },
+    {
+      "epoch": 0.8852,
+      "grad_norm": 0.34829980188505943,
+      "learning_rate": 6.83283818026812e-06,
+      "loss": 0.5513,
+      "step": 2213
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.33545995370726744,
+      "learning_rate": 6.785850586469989e-06,
+      "loss": 0.5382,
+      "step": 2214
+    },
+    {
+      "epoch": 0.886,
+      "grad_norm": 0.3686607696228201,
+      "learning_rate": 6.739019436357774e-06,
+      "loss": 0.545,
+      "step": 2215
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3360522782306863,
+      "learning_rate": 6.692344808529427e-06,
+      "loss": 0.5816,
+      "step": 2216
+    },
+    {
+      "epoch": 0.8868,
+      "grad_norm": 0.3450348296150317,
+      "learning_rate": 6.645826781320142e-06,
+      "loss": 0.6355,
+      "step": 2217
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.32055865848150106,
+      "learning_rate": 6.599465432802332e-06,
+      "loss": 0.6024,
+      "step": 2218
+    },
+    {
+      "epoch": 0.8876,
+      "grad_norm": 0.34997373998894216,
+      "learning_rate": 6.553260840785414e-06,
+      "loss": 0.5816,
+      "step": 2219
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.32760059318056384,
+      "learning_rate": 6.507213082815744e-06,
+      "loss": 0.5817,
+      "step": 2220
+    },
+    {
+      "epoch": 0.8884,
+      "grad_norm": 0.3242711784203104,
+      "learning_rate": 6.461322236176437e-06,
+      "loss": 0.5533,
+      "step": 2221
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.3316895678247226,
+      "learning_rate": 6.415588377887305e-06,
+      "loss": 0.5733,
+      "step": 2222
+    },
+    {
+      "epoch": 0.8892,
+      "grad_norm": 0.3450565066703003,
+      "learning_rate": 6.370011584704616e-06,
+      "loss": 0.5729,
+      "step": 2223
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.34878316171392754,
+      "learning_rate": 6.324591933121071e-06,
+      "loss": 0.6155,
+      "step": 2224
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.3606929169768857,
+      "learning_rate": 6.2793294993656494e-06,
+      "loss": 0.629,
+      "step": 2225
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.35038468644965637,
+      "learning_rate": 6.2342243594034066e-06,
+      "loss": 0.5781,
+      "step": 2226
+    },
+    {
+      "epoch": 0.8908,
+      "grad_norm": 0.3490618098903304,
+      "learning_rate": 6.1892765889355e-06,
+      "loss": 0.5881,
+      "step": 2227
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3414333206005217,
+      "learning_rate": 6.144486263398886e-06,
+      "loss": 0.6083,
+      "step": 2228
+    },
+    {
+      "epoch": 0.8916,
+      "grad_norm": 0.3346368217085105,
+      "learning_rate": 6.0998534579663425e-06,
+      "loss": 0.573,
+      "step": 2229
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.33617848103996995,
+      "learning_rate": 6.055378247546218e-06,
+      "loss": 0.6062,
+      "step": 2230
+    },
+    {
+      "epoch": 0.8924,
+      "grad_norm": 0.35135871815177266,
+      "learning_rate": 6.01106070678239e-06,
+      "loss": 0.6165,
+      "step": 2231
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3351654589423389,
+      "learning_rate": 5.96690091005414e-06,
+      "loss": 0.6054,
+      "step": 2232
+    },
+    {
+      "epoch": 0.8932,
+      "grad_norm": 0.33937639291603283,
+      "learning_rate": 5.922898931475973e-06,
+      "loss": 0.5504,
+      "step": 2233
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.37166808012426555,
+      "learning_rate": 5.879054844897536e-06,
+      "loss": 0.5873,
+      "step": 2234
+    },
+    {
+      "epoch": 0.894,
+      "grad_norm": 0.34574335975587034,
+      "learning_rate": 5.835368723903456e-06,
+      "loss": 0.626,
+      "step": 2235
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.345540554802604,
+      "learning_rate": 5.791840641813295e-06,
+      "loss": 0.5567,
+      "step": 2236
+    },
+    {
+      "epoch": 0.8948,
+      "grad_norm": 0.36032088988088107,
+      "learning_rate": 5.748470671681327e-06,
+      "loss": 0.6215,
+      "step": 2237
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.35939823610080457,
+      "learning_rate": 5.705258886296494e-06,
+      "loss": 0.5905,
+      "step": 2238
+    },
+    {
+      "epoch": 0.8956,
+      "grad_norm": 0.3908783146108529,
+      "learning_rate": 5.662205358182226e-06,
+      "loss": 0.5593,
+      "step": 2239
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.35664593571911507,
+      "learning_rate": 5.6193101595963585e-06,
+      "loss": 0.5696,
+      "step": 2240
+    },
+    {
+      "epoch": 0.8964,
+      "grad_norm": 0.33399971612249935,
+      "learning_rate": 5.576573362531001e-06,
+      "loss": 0.5926,
+      "step": 2241
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.35303598976057604,
+      "learning_rate": 5.533995038712403e-06,
+      "loss": 0.5655,
+      "step": 2242
+    },
+    {
+      "epoch": 0.8972,
+      "grad_norm": 0.34392994527527554,
+      "learning_rate": 5.491575259600879e-06,
+      "loss": 0.6274,
+      "step": 2243
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.3531636604039412,
+      "learning_rate": 5.449314096390601e-06,
+      "loss": 0.5736,
+      "step": 2244
+    },
+    {
+      "epoch": 0.898,
+      "grad_norm": 0.3440271547583296,
+      "learning_rate": 5.407211620009544e-06,
+      "loss": 0.5564,
+      "step": 2245
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.35193330728329647,
+      "learning_rate": 5.365267901119397e-06,
+      "loss": 0.5596,
+      "step": 2246
+    },
+    {
+      "epoch": 0.8988,
+      "grad_norm": 0.3794083771602135,
+      "learning_rate": 5.323483010115382e-06,
+      "loss": 0.6279,
+      "step": 2247
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3441663137581851,
+      "learning_rate": 5.281857017126124e-06,
+      "loss": 0.6086,
+      "step": 2248
+    },
+    {
+      "epoch": 0.8996,
+      "grad_norm": 0.35205692114369125,
+      "learning_rate": 5.240389992013606e-06,
+      "loss": 0.5958,
+      "step": 2249
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.3255215354488011,
+      "learning_rate": 5.199082004372957e-06,
+      "loss": 0.5505,
+      "step": 2250
+    },
+    {
+      "epoch": 0.9004,
+      "grad_norm": 0.33028656101732806,
+      "learning_rate": 5.157933123532465e-06,
+      "loss": 0.5662,
+      "step": 2251
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.33655710024673235,
+      "learning_rate": 5.116943418553355e-06,
+      "loss": 0.5752,
+      "step": 2252
+    },
+    {
+      "epoch": 0.9012,
+      "grad_norm": 0.3466324797514499,
+      "learning_rate": 5.076112958229673e-06,
+      "loss": 0.5663,
+      "step": 2253
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.34687514423799,
+      "learning_rate": 5.035441811088204e-06,
+      "loss": 0.6246,
+      "step": 2254
+    },
+    {
+      "epoch": 0.902,
+      "grad_norm": 0.36854360123541985,
+      "learning_rate": 4.994930045388413e-06,
+      "loss": 0.6255,
+      "step": 2255
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3475971068131044,
+      "learning_rate": 4.9545777291222116e-06,
+      "loss": 0.624,
+      "step": 2256
+    },
+    {
+      "epoch": 0.9028,
+      "grad_norm": 0.3484591062388061,
+      "learning_rate": 4.914384930013927e-06,
+      "loss": 0.6347,
+      "step": 2257
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.35781500647827147,
+      "learning_rate": 4.874351715520153e-06,
+      "loss": 0.6228,
+      "step": 2258
+    },
+    {
+      "epoch": 0.9036,
+      "grad_norm": 0.33479370202552666,
+      "learning_rate": 4.834478152829658e-06,
+      "loss": 0.5395,
+      "step": 2259
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3141925719904852,
+      "learning_rate": 4.794764308863242e-06,
+      "loss": 0.5663,
+      "step": 2260
+    },
+    {
+      "epoch": 0.9044,
+      "grad_norm": 0.33925843606100425,
+      "learning_rate": 4.7552102502737e-06,
+      "loss": 0.5878,
+      "step": 2261
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.31770909911930123,
+      "learning_rate": 4.715816043445609e-06,
+      "loss": 0.5423,
+      "step": 2262
+    },
+    {
+      "epoch": 0.9052,
+      "grad_norm": 0.33756385648246795,
+      "learning_rate": 4.676581754495235e-06,
+      "loss": 0.5288,
+      "step": 2263
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.33754218398571206,
+      "learning_rate": 4.637507449270517e-06,
+      "loss": 0.5738,
+      "step": 2264
+    },
+    {
+      "epoch": 0.906,
+      "grad_norm": 0.3398726259565656,
+      "learning_rate": 4.5985931933508754e-06,
+      "loss": 0.5553,
+      "step": 2265
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.3370387764338048,
+      "learning_rate": 4.559839052047066e-06,
+      "loss": 0.5934,
+      "step": 2266
+    },
+    {
+      "epoch": 0.9068,
+      "grad_norm": 0.3499969257394548,
+      "learning_rate": 4.521245090401172e-06,
+      "loss": 0.5721,
+      "step": 2267
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3545203034920037,
+      "learning_rate": 4.482811373186402e-06,
+      "loss": 0.5903,
+      "step": 2268
+    },
+    {
+      "epoch": 0.9076,
+      "grad_norm": 0.3488243483737572,
+      "learning_rate": 4.444537964907058e-06,
+      "loss": 0.5761,
+      "step": 2269
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.336752297507094,
+      "learning_rate": 4.406424929798403e-06,
+      "loss": 0.5814,
+      "step": 2270
+    },
+    {
+      "epoch": 0.9084,
+      "grad_norm": 0.3441597428004464,
+      "learning_rate": 4.368472331826478e-06,
+      "loss": 0.5953,
+      "step": 2271
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.34571888716720656,
+      "learning_rate": 4.330680234688112e-06,
+      "loss": 0.5509,
+      "step": 2272
+    },
+    {
+      "epoch": 0.9092,
+      "grad_norm": 0.347900098704324,
+      "learning_rate": 4.2930487018107424e-06,
+      "loss": 0.5934,
+      "step": 2273
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.39059031228716573,
+      "learning_rate": 4.25557779635235e-06,
+      "loss": 0.5872,
+      "step": 2274
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.3115463345804607,
+      "learning_rate": 4.2182675812012965e-06,
+      "loss": 0.5412,
+      "step": 2275
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.35340103148990887,
+      "learning_rate": 4.1811181189762685e-06,
+      "loss": 0.5759,
+      "step": 2276
+    },
+    {
+      "epoch": 0.9108,
+      "grad_norm": 0.33121808970550043,
+      "learning_rate": 4.144129472026137e-06,
+      "loss": 0.5895,
+      "step": 2277
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.36392858232423886,
+      "learning_rate": 4.107301702429922e-06,
+      "loss": 0.5693,
+      "step": 2278
+    },
+    {
+      "epoch": 0.9116,
+      "grad_norm": 0.3376782724989046,
+      "learning_rate": 4.070634871996615e-06,
+      "loss": 0.5469,
+      "step": 2279
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3342447081785353,
+      "learning_rate": 4.034129042265066e-06,
+      "loss": 0.6196,
+      "step": 2280
+    },
+    {
+      "epoch": 0.9124,
+      "grad_norm": 0.3342148208439684,
+      "learning_rate": 3.997784274503946e-06,
+      "loss": 0.5585,
+      "step": 2281
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.3522583216373266,
+      "learning_rate": 3.961600629711615e-06,
+      "loss": 0.577,
+      "step": 2282
+    },
+    {
+      "epoch": 0.9132,
+      "grad_norm": 0.3430041773663385,
+      "learning_rate": 3.925578168616006e-06,
+      "loss": 0.5813,
+      "step": 2283
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.35823019371664233,
+      "learning_rate": 3.8897169516745495e-06,
+      "loss": 0.5895,
+      "step": 2284
+    },
+    {
+      "epoch": 0.914,
+      "grad_norm": 0.32901058067219846,
+      "learning_rate": 3.854017039074009e-06,
+      "loss": 0.5724,
+      "step": 2285
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.3483576351065279,
+      "learning_rate": 3.818478490730471e-06,
+      "loss": 0.5943,
+      "step": 2286
+    },
+    {
+      "epoch": 0.9148,
+      "grad_norm": 0.3599007095841397,
+      "learning_rate": 3.783101366289199e-06,
+      "loss": 0.6038,
+      "step": 2287
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3497951907501343,
+      "learning_rate": 3.7478857251245227e-06,
+      "loss": 0.593,
+      "step": 2288
+    },
+    {
+      "epoch": 0.9156,
+      "grad_norm": 0.3302294964276083,
+      "learning_rate": 3.712831626339752e-06,
+      "loss": 0.5568,
+      "step": 2289
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3377498089915625,
+      "learning_rate": 3.6779391287670494e-06,
+      "loss": 0.5952,
+      "step": 2290
+    },
+    {
+      "epoch": 0.9164,
+      "grad_norm": 0.36302543364294015,
+      "learning_rate": 3.643208290967415e-06,
+      "loss": 0.5769,
+      "step": 2291
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.34320098024274204,
+      "learning_rate": 3.6086391712304878e-06,
+      "loss": 0.5187,
+      "step": 2292
+    },
+    {
+      "epoch": 0.9172,
+      "grad_norm": 0.3313292168229392,
+      "learning_rate": 3.5742318275745145e-06,
+      "loss": 0.5564,
+      "step": 2293
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.32071056996032454,
+      "learning_rate": 3.5399863177462024e-06,
+      "loss": 0.5473,
+      "step": 2294
+    },
+    {
+      "epoch": 0.918,
+      "grad_norm": 0.35446421413133045,
+      "learning_rate": 3.5059026992206647e-06,
+      "loss": 0.5955,
+      "step": 2295
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.359590208707977,
+      "learning_rate": 3.471981029201321e-06,
+      "loss": 0.5935,
+      "step": 2296
+    },
+    {
+      "epoch": 0.9188,
+      "grad_norm": 0.3309916716308859,
+      "learning_rate": 3.4382213646197757e-06,
+      "loss": 0.5802,
+      "step": 2297
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.3420185321533074,
+      "learning_rate": 3.404623762135728e-06,
+      "loss": 0.5919,
+      "step": 2298
+    },
+    {
+      "epoch": 0.9196,
+      "grad_norm": 0.3511245899584105,
+      "learning_rate": 3.371188278136883e-06,
+      "loss": 0.5894,
+      "step": 2299
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.34254644203055135,
+      "learning_rate": 3.3379149687388867e-06,
+      "loss": 0.5692,
+      "step": 2300
+    },
+    {
+      "epoch": 0.9204,
+      "grad_norm": 0.358085252248771,
+      "learning_rate": 3.3048038897851573e-06,
+      "loss": 0.6141,
+      "step": 2301
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.33421788692383125,
+      "learning_rate": 3.271855096846899e-06,
+      "loss": 0.5761,
+      "step": 2302
+    },
+    {
+      "epoch": 0.9212,
+      "grad_norm": 0.3305924315223357,
+      "learning_rate": 3.239068645222898e-06,
+      "loss": 0.5717,
+      "step": 2303
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.31373356311523226,
+      "learning_rate": 3.2064445899394724e-06,
+      "loss": 0.5026,
+      "step": 2304
+    },
+    {
+      "epoch": 0.922,
+      "grad_norm": 0.3395025535682046,
+      "learning_rate": 3.1739829857504234e-06,
+      "loss": 0.5479,
+      "step": 2305
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.3355033876942166,
+      "learning_rate": 3.1416838871368924e-06,
+      "loss": 0.5423,
+      "step": 2306
+    },
+    {
+      "epoch": 0.9228,
+      "grad_norm": 0.32447934130362194,
+      "learning_rate": 3.1095473483072733e-06,
+      "loss": 0.5355,
+      "step": 2307
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.3288323850850867,
+      "learning_rate": 3.077573423197144e-06,
+      "loss": 0.6181,
+      "step": 2308
+    },
+    {
+      "epoch": 0.9236,
+      "grad_norm": 0.37778904569320426,
+      "learning_rate": 3.045762165469168e-06,
+      "loss": 0.5845,
+      "step": 2309
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.3532701249550537,
+      "learning_rate": 3.014113628512982e-06,
+      "loss": 0.5839,
+      "step": 2310
+    },
+    {
+      "epoch": 0.9244,
+      "grad_norm": 0.35176239612432864,
+      "learning_rate": 2.982627865445109e-06,
+      "loss": 0.5754,
+      "step": 2311
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3368894956902543,
+      "learning_rate": 2.9513049291089555e-06,
+      "loss": 0.5937,
+      "step": 2312
+    },
+    {
+      "epoch": 0.9252,
+      "grad_norm": 0.3402801417193046,
+      "learning_rate": 2.9201448720745706e-06,
+      "loss": 0.5909,
+      "step": 2313
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.33648464173002784,
+      "learning_rate": 2.8891477466386986e-06,
+      "loss": 0.5641,
+      "step": 2314
+    },
+    {
+      "epoch": 0.926,
+      "grad_norm": 0.5168140106464583,
+      "learning_rate": 2.8583136048245697e-06,
+      "loss": 0.5442,
+      "step": 2315
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.363563408061145,
+      "learning_rate": 2.827642498381955e-06,
+      "loss": 0.6082,
+      "step": 2316
+    },
+    {
+      "epoch": 0.9268,
+      "grad_norm": 0.3504629693467053,
+      "learning_rate": 2.797134478786911e-06,
+      "loss": 0.6349,
+      "step": 2317
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.3616532069744127,
+      "learning_rate": 2.76678959724187e-06,
+      "loss": 0.6088,
+      "step": 2318
+    },
+    {
+      "epoch": 0.9276,
+      "grad_norm": 0.3507151585532411,
+      "learning_rate": 2.7366079046753924e-06,
+      "loss": 0.5617,
+      "step": 2319
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.340841838394149,
+      "learning_rate": 2.706589451742181e-06,
+      "loss": 0.5989,
+      "step": 2320
+    },
+    {
+      "epoch": 0.9284,
+      "grad_norm": 0.335911542938969,
+      "learning_rate": 2.6767342888229908e-06,
+      "loss": 0.5318,
+      "step": 2321
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.3734278475275637,
+      "learning_rate": 2.647042466024485e-06,
+      "loss": 0.6167,
+      "step": 2322
+    },
+    {
+      "epoch": 0.9292,
+      "grad_norm": 0.3364953235443596,
+      "learning_rate": 2.617514033179236e-06,
+      "loss": 0.5745,
+      "step": 2323
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3365097083040503,
+      "learning_rate": 2.5881490398455332e-06,
+      "loss": 0.581,
+      "step": 2324
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.3501391841424808,
+      "learning_rate": 2.5589475353073988e-06,
+      "loss": 0.6072,
+      "step": 2325
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.33519945683522734,
+      "learning_rate": 2.5299095685744735e-06,
+      "loss": 0.5621,
+      "step": 2326
+    },
+    {
+      "epoch": 0.9308,
+      "grad_norm": 0.3398021668968586,
+      "learning_rate": 2.5010351883819284e-06,
+      "loss": 0.598,
+      "step": 2327
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.35673857756315136,
+      "learning_rate": 2.472324443190355e-06,
+      "loss": 0.6154,
+      "step": 2328
+    },
+    {
+      "epoch": 0.9316,
+      "grad_norm": 0.33820518950481704,
+      "learning_rate": 2.44377738118573e-06,
+      "loss": 0.5879,
+      "step": 2329
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.3528059102780005,
+      "learning_rate": 2.415394050279318e-06,
+      "loss": 0.5682,
+      "step": 2330
+    },
+    {
+      "epoch": 0.9324,
+      "grad_norm": 0.34393419476168063,
+      "learning_rate": 2.3871744981076136e-06,
+      "loss": 0.5976,
+      "step": 2331
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3605738869567883,
+      "learning_rate": 2.359118772032176e-06,
+      "loss": 0.6091,
+      "step": 2332
+    },
+    {
+      "epoch": 0.9332,
+      "grad_norm": 0.3200807481244181,
+      "learning_rate": 2.331226919139662e-06,
+      "loss": 0.5778,
+      "step": 2333
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.33176492826326637,
+      "learning_rate": 2.30349898624167e-06,
+      "loss": 0.5517,
+      "step": 2334
+    },
+    {
+      "epoch": 0.934,
+      "grad_norm": 0.3647925866698948,
+      "learning_rate": 2.2759350198746976e-06,
+      "loss": 0.5866,
+      "step": 2335
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3489184401906757,
+      "learning_rate": 2.2485350663000725e-06,
+      "loss": 0.5707,
+      "step": 2336
+    },
+    {
+      "epoch": 0.9348,
+      "grad_norm": 0.33620195135863973,
+      "learning_rate": 2.2212991715038324e-06,
+      "loss": 0.5142,
+      "step": 2337
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.3366421335199147,
+      "learning_rate": 2.1942273811966563e-06,
+      "loss": 0.5633,
+      "step": 2338
+    },
+    {
+      "epoch": 0.9356,
+      "grad_norm": 0.3504606894013506,
+      "learning_rate": 2.1673197408138115e-06,
+      "loss": 0.5986,
+      "step": 2339
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3643680697201967,
+      "learning_rate": 2.1405762955151176e-06,
+      "loss": 0.5975,
+      "step": 2340
+    },
+    {
+      "epoch": 0.9364,
+      "grad_norm": 0.36971268743873886,
+      "learning_rate": 2.1139970901847606e-06,
+      "loss": 0.6056,
+      "step": 2341
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.3205072351313767,
+      "learning_rate": 2.0875821694313013e-06,
+      "loss": 0.5336,
+      "step": 2342
+    },
+    {
+      "epoch": 0.9372,
+      "grad_norm": 0.3281437642775695,
+      "learning_rate": 2.061331577587566e-06,
+      "loss": 0.5739,
+      "step": 2343
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.35423646459317043,
+      "learning_rate": 2.035245358710591e-06,
+      "loss": 0.5491,
+      "step": 2344
+    },
+    {
+      "epoch": 0.938,
+      "grad_norm": 0.33130157775600444,
+      "learning_rate": 2.009323556581566e-06,
+      "loss": 0.5358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.34238238221209616,
+      "learning_rate": 1.983566214705701e-06,
+      "loss": 0.5751,
+      "step": 2346
+    },
+    {
+      "epoch": 0.9388,
+      "grad_norm": 0.34453265639186,
+      "learning_rate": 1.9579733763121944e-06,
+      "loss": 0.5756,
+      "step": 2347
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.32732656269296073,
+      "learning_rate": 1.9325450843541536e-06,
+      "loss": 0.5608,
+      "step": 2348
+    },
+    {
+      "epoch": 0.9396,
+      "grad_norm": 0.3205611924412672,
+      "learning_rate": 1.9072813815085523e-06,
+      "loss": 0.572,
+      "step": 2349
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3421977666997643,
+      "learning_rate": 1.882182310176095e-06,
+      "loss": 0.5506,
+      "step": 2350
+    },
+    {
+      "epoch": 0.9404,
+      "grad_norm": 0.33785091419659685,
+      "learning_rate": 1.857247912481197e-06,
+      "loss": 0.5786,
+      "step": 2351
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.34156770747211374,
+      "learning_rate": 1.8324782302718834e-06,
+      "loss": 0.5906,
+      "step": 2352
+    },
+    {
+      "epoch": 0.9412,
+      "grad_norm": 0.35953608773240814,
+      "learning_rate": 1.807873305119756e-06,
+      "loss": 0.5629,
+      "step": 2353
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.33150979888496007,
+      "learning_rate": 1.7834331783198932e-06,
+      "loss": 0.539,
+      "step": 2354
+    },
+    {
+      "epoch": 0.942,
+      "grad_norm": 0.34936801364046166,
+      "learning_rate": 1.7591578908907724e-06,
+      "loss": 0.6048,
+      "step": 2355
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3716423177680825,
+      "learning_rate": 1.7350474835742147e-06,
+      "loss": 0.6283,
+      "step": 2356
+    },
+    {
+      "epoch": 0.9428,
+      "grad_norm": 0.34291629219173464,
+      "learning_rate": 1.7111019968353626e-06,
+      "loss": 0.5801,
+      "step": 2357
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.3755431906469608,
+      "learning_rate": 1.687321470862524e-06,
+      "loss": 0.6079,
+      "step": 2358
+    },
+    {
+      "epoch": 0.9436,
+      "grad_norm": 0.3906127082745325,
+      "learning_rate": 1.6637059455671622e-06,
+      "loss": 0.6016,
+      "step": 2359
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.34306552212420544,
+      "learning_rate": 1.6402554605838172e-06,
+      "loss": 0.6368,
+      "step": 2360
+    },
+    {
+      "epoch": 0.9444,
+      "grad_norm": 0.34742338711971815,
+      "learning_rate": 1.6169700552700284e-06,
+      "loss": 0.6125,
+      "step": 2361
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.3618315613050588,
+      "learning_rate": 1.5938497687062904e-06,
+      "loss": 0.6126,
+      "step": 2362
+    },
+    {
+      "epoch": 0.9452,
+      "grad_norm": 0.337892601102626,
+      "learning_rate": 1.5708946396959856e-06,
+      "loss": 0.5933,
+      "step": 2363
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.35492727607163377,
+      "learning_rate": 1.5481047067652743e-06,
+      "loss": 0.6258,
+      "step": 2364
+    },
+    {
+      "epoch": 0.946,
+      "grad_norm": 0.37793937027747654,
+      "learning_rate": 1.5254800081630826e-06,
+      "loss": 0.5969,
+      "step": 2365
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.33236758505131336,
+      "learning_rate": 1.5030205818610254e-06,
+      "loss": 0.6169,
+      "step": 2366
+    },
+    {
+      "epoch": 0.9468,
+      "grad_norm": 0.3740054825973699,
+      "learning_rate": 1.4807264655533281e-06,
+      "loss": 0.56,
+      "step": 2367
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3341365422221843,
+      "learning_rate": 1.4585976966567828e-06,
+      "loss": 0.5486,
+      "step": 2368
+    },
+    {
+      "epoch": 0.9476,
+      "grad_norm": 0.34825714160269117,
+      "learning_rate": 1.4366343123106695e-06,
+      "loss": 0.6052,
+      "step": 2369
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.35285631594915745,
+      "learning_rate": 1.4148363493766802e-06,
+      "loss": 0.598,
+      "step": 2370
+    },
+    {
+      "epoch": 0.9484,
+      "grad_norm": 0.3573694612073983,
+      "learning_rate": 1.3932038444389062e-06,
+      "loss": 0.5494,
+      "step": 2371
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3695274957618561,
+      "learning_rate": 1.3717368338037161e-06,
+      "loss": 0.5738,
+      "step": 2372
+    },
+    {
+      "epoch": 0.9492,
+      "grad_norm": 0.3818659467699447,
+      "learning_rate": 1.3504353534997683e-06,
+      "loss": 0.6347,
+      "step": 2373
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.39038771420550783,
+      "learning_rate": 1.3292994392778536e-06,
+      "loss": 0.5303,
+      "step": 2374
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.3677982950208698,
+      "learning_rate": 1.30832912661093e-06,
+      "loss": 0.6208,
+      "step": 2375
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.32153789890989554,
+      "learning_rate": 1.2875244506940109e-06,
+      "loss": 0.5584,
+      "step": 2376
+    },
+    {
+      "epoch": 0.9508,
+      "grad_norm": 0.3301731661423886,
+      "learning_rate": 1.2668854464441104e-06,
+      "loss": 0.5679,
+      "step": 2377
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.3558794472870126,
+      "learning_rate": 1.2464121485001978e-06,
+      "loss": 0.5833,
+      "step": 2378
+    },
+    {
+      "epoch": 0.9516,
+      "grad_norm": 0.33375321358163346,
+      "learning_rate": 1.2261045912231318e-06,
+      "loss": 0.5675,
+      "step": 2379
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.35354128098230686,
+      "learning_rate": 1.2059628086956044e-06,
+      "loss": 0.5916,
+      "step": 2380
+    },
+    {
+      "epoch": 0.9524,
+      "grad_norm": 0.3491234364844722,
+      "learning_rate": 1.1859868347220749e-06,
+      "loss": 0.5967,
+      "step": 2381
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.33956987922311915,
+      "learning_rate": 1.1661767028287363e-06,
+      "loss": 0.5891,
+      "step": 2382
+    },
+    {
+      "epoch": 0.9532,
+      "grad_norm": 0.3426281690207688,
+      "learning_rate": 1.1465324462634375e-06,
+      "loss": 0.5967,
+      "step": 2383
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.38649212119752785,
+      "learning_rate": 1.1270540979956502e-06,
+      "loss": 0.5573,
+      "step": 2384
+    },
+    {
+      "epoch": 0.954,
+      "grad_norm": 0.3482598521397009,
+      "learning_rate": 1.1077416907163574e-06,
+      "loss": 0.6243,
+      "step": 2385
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.34723913994768063,
+      "learning_rate": 1.0885952568380764e-06,
+      "loss": 0.5999,
+      "step": 2386
+    },
+    {
+      "epoch": 0.9548,
+      "grad_norm": 0.3420298689457409,
+      "learning_rate": 1.0696148284947694e-06,
+      "loss": 0.5875,
+      "step": 2387
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3475456256380816,
+      "learning_rate": 1.0508004375417546e-06,
+      "loss": 0.5913,
+      "step": 2388
+    },
+    {
+      "epoch": 0.9556,
+      "grad_norm": 0.3405188863402418,
+      "learning_rate": 1.0321521155557179e-06,
+      "loss": 0.5335,
+      "step": 2389
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.35529313752021524,
+      "learning_rate": 1.0136698938346011e-06,
+      "loss": 0.5859,
+      "step": 2390
+    },
+    {
+      "epoch": 0.9564,
+      "grad_norm": 0.3430242218150067,
+      "learning_rate": 9.953538033975918e-07,
+      "loss": 0.6045,
+      "step": 2391
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.348871926036714,
+      "learning_rate": 9.772038749850665e-07,
+      "loss": 0.572,
+      "step": 2392
+    },
+    {
+      "epoch": 0.9572,
+      "grad_norm": 0.35317078131207735,
+      "learning_rate": 9.59220139058492e-07,
+      "loss": 0.6179,
+      "step": 2393
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.323743943833337,
+      "learning_rate": 9.414026258004582e-07,
+      "loss": 0.5333,
+      "step": 2394
+    },
+    {
+      "epoch": 0.958,
+      "grad_norm": 0.33483465085793146,
+      "learning_rate": 9.237513651145225e-07,
+      "loss": 0.5729,
+      "step": 2395
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3253237012665603,
+      "learning_rate": 9.062663866252541e-07,
+      "loss": 0.5949,
+      "step": 2396
+    },
+    {
+      "epoch": 0.9588,
+      "grad_norm": 0.3325136181038339,
+      "learning_rate": 8.889477196781571e-07,
+      "loss": 0.5344,
+      "step": 2397
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.35296644109808906,
+      "learning_rate": 8.717953933395694e-07,
+      "loss": 0.605,
+      "step": 2398
+    },
+    {
+      "epoch": 0.9596,
+      "grad_norm": 0.3390970266581428,
+      "learning_rate": 8.548094363966974e-07,
+      "loss": 0.5447,
+      "step": 2399
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.36166833688023653,
+      "learning_rate": 8.379898773574924e-07,
+      "loss": 0.5642,
+      "step": 2400
+    },
+    {
+      "epoch": 0.9604,
+      "grad_norm": 0.42843968940094684,
+      "learning_rate": 8.213367444506515e-07,
+      "loss": 0.5747,
+      "step": 2401
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.3472675809359243,
+      "learning_rate": 8.048500656255509e-07,
+      "loss": 0.6021,
+      "step": 2402
+    },
+    {
+      "epoch": 0.9612,
+      "grad_norm": 0.37501497520210736,
+      "learning_rate": 7.885298685522235e-07,
+      "loss": 0.5626,
+      "step": 2403
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.36453428855164266,
+      "learning_rate": 7.72376180621237e-07,
+      "loss": 0.5585,
+      "step": 2404
+    },
+    {
+      "epoch": 0.962,
+      "grad_norm": 0.36866230726234994,
+      "learning_rate": 7.563890289437825e-07,
+      "loss": 0.6233,
+      "step": 2405
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.357034665309984,
+      "learning_rate": 7.405684403514634e-07,
+      "loss": 0.5996,
+      "step": 2406
+    },
+    {
+      "epoch": 0.9628,
+      "grad_norm": 0.3462692994450068,
+      "learning_rate": 7.24914441396396e-07,
+      "loss": 0.5877,
+      "step": 2407
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3442435233311984,
+      "learning_rate": 7.094270583510975e-07,
+      "loss": 0.5682,
+      "step": 2408
+    },
+    {
+      "epoch": 0.9636,
+      "grad_norm": 0.38064657260176105,
+      "learning_rate": 6.941063172084095e-07,
+      "loss": 0.5741,
+      "step": 2409
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.3534661622519689,
+      "learning_rate": 6.78952243681541e-07,
+      "loss": 0.6255,
+      "step": 2410
+    },
+    {
+      "epoch": 0.9644,
+      "grad_norm": 0.3328590047781539,
+      "learning_rate": 6.639648632039697e-07,
+      "loss": 0.5752,
+      "step": 2411
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3429833811094635,
+      "learning_rate": 6.491442009293858e-07,
+      "loss": 0.6012,
+      "step": 2412
+    },
+    {
+      "epoch": 0.9652,
+      "grad_norm": 0.3561655404495796,
+      "learning_rate": 6.344902817316812e-07,
+      "loss": 0.5503,
+      "step": 2413
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.33775128399922205,
+      "learning_rate": 6.200031302049047e-07,
+      "loss": 0.5851,
+      "step": 2414
+    },
+    {
+      "epoch": 0.966,
+      "grad_norm": 0.3627069667953671,
+      "learning_rate": 6.056827706632185e-07,
+      "loss": 0.6216,
+      "step": 2415
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3493171408060569,
+      "learning_rate": 5.915292271408524e-07,
+      "loss": 0.6179,
+      "step": 2416
+    },
+    {
+      "epoch": 0.9668,
+      "grad_norm": 0.362713924190697,
+      "learning_rate": 5.775425233920495e-07,
+      "loss": 0.5923,
+      "step": 2417
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.3883190641575212,
+      "learning_rate": 5.637226828910436e-07,
+      "loss": 0.5861,
+      "step": 2418
+    },
+    {
+      "epoch": 0.9676,
+      "grad_norm": 0.34342397440804545,
+      "learning_rate": 5.500697288320478e-07,
+      "loss": 0.5277,
+      "step": 2419
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.34098783955612,
+      "learning_rate": 5.365836841291438e-07,
+      "loss": 0.5932,
+      "step": 2420
+    },
+    {
+      "epoch": 0.9684,
+      "grad_norm": 0.37181080315426046,
+      "learning_rate": 5.232645714163265e-07,
+      "loss": 0.5796,
+      "step": 2421
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.34473297284696364,
+      "learning_rate": 5.101124130473811e-07,
+      "loss": 0.5895,
+      "step": 2422
+    },
+    {
+      "epoch": 0.9692,
+      "grad_norm": 0.37410772340681114,
+      "learning_rate": 4.971272310959063e-07,
+      "loss": 0.5757,
+      "step": 2423
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3653164649937013,
+      "learning_rate": 4.843090473552913e-07,
+      "loss": 0.6452,
+      "step": 2424
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.3593211432808053,
+      "learning_rate": 4.7165788333860536e-07,
+      "loss": 0.5985,
+      "step": 2425
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.35245013507684075,
+      "learning_rate": 4.5917376027861945e-07,
+      "loss": 0.5228,
+      "step": 2426
+    },
+    {
+      "epoch": 0.9708,
+      "grad_norm": 0.3537624277112952,
+      "learning_rate": 4.468566991277512e-07,
+      "loss": 0.5949,
+      "step": 2427
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.32705621542268176,
+      "learning_rate": 4.347067205580424e-07,
+      "loss": 0.5503,
+      "step": 2428
+    },
+    {
+      "epoch": 0.9716,
+      "grad_norm": 0.3382575227276866,
+      "learning_rate": 4.2272384496112594e-07,
+      "loss": 0.5573,
+      "step": 2429
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.3584734009623166,
+      "learning_rate": 4.1090809244814785e-07,
+      "loss": 0.5704,
+      "step": 2430
+    },
+    {
+      "epoch": 0.9724,
+      "grad_norm": 0.32131468623218323,
+      "learning_rate": 3.9925948284980086e-07,
+      "loss": 0.566,
+      "step": 2431
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.35002803250987646,
+      "learning_rate": 3.877780357162353e-07,
+      "loss": 0.6152,
+      "step": 2432
+    },
+    {
+      "epoch": 0.9732,
+      "grad_norm": 0.3511181384629274,
+      "learning_rate": 3.7646377031705924e-07,
+      "loss": 0.5882,
+      "step": 2433
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.34404651785041235,
+      "learning_rate": 3.653167056413054e-07,
+      "loss": 0.6157,
+      "step": 2434
+    },
+    {
+      "epoch": 0.974,
+      "grad_norm": 0.3379154238877894,
+      "learning_rate": 3.543368603973529e-07,
+      "loss": 0.5449,
+      "step": 2435
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3381776255081913,
+      "learning_rate": 3.4352425301297233e-07,
+      "loss": 0.5812,
+      "step": 2436
+    },
+    {
+      "epoch": 0.9748,
+      "grad_norm": 0.34012625463195734,
+      "learning_rate": 3.3287890163523626e-07,
+      "loss": 0.5641,
+      "step": 2437
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.36650941460648767,
+      "learning_rate": 3.2240082413049765e-07,
+      "loss": 0.6161,
+      "step": 2438
+    },
+    {
+      "epoch": 0.9756,
+      "grad_norm": 0.34436265712860376,
+      "learning_rate": 3.120900380844116e-07,
+      "loss": 0.597,
+      "step": 2439
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.32650237175810803,
+      "learning_rate": 3.019465608018024e-07,
+      "loss": 0.5198,
+      "step": 2440
+    },
+    {
+      "epoch": 0.9764,
+      "grad_norm": 0.32621853345125035,
+      "learning_rate": 2.91970409306741e-07,
+      "loss": 0.5791,
+      "step": 2441
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.3340739671871019,
+      "learning_rate": 2.8216160034244543e-07,
+      "loss": 0.5695,
+      "step": 2442
+    },
+    {
+      "epoch": 0.9772,
+      "grad_norm": 0.3516249067193672,
+      "learning_rate": 2.7252015037131375e-07,
+      "loss": 0.5613,
+      "step": 2443
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3548198648851666,
+      "learning_rate": 2.630460755748132e-07,
+      "loss": 0.6438,
+      "step": 2444
+    },
+    {
+      "epoch": 0.978,
+      "grad_norm": 0.33513881905909393,
+      "learning_rate": 2.537393918535358e-07,
+      "loss": 0.5676,
+      "step": 2445
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.3448911204069862,
+      "learning_rate": 2.4460011482713153e-07,
+      "loss": 0.5809,
+      "step": 2446
+    },
+    {
+      "epoch": 0.9788,
+      "grad_norm": 0.32549122686674664,
+      "learning_rate": 2.3562825983427516e-07,
+      "loss": 0.5408,
+      "step": 2447
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3352492333611208,
+      "learning_rate": 2.2682384193266626e-07,
+      "loss": 0.5787,
+      "step": 2448
+    },
+    {
+      "epoch": 0.9796,
+      "grad_norm": 0.3511330639105213,
+      "learning_rate": 2.1818687589896246e-07,
+      "loss": 0.5626,
+      "step": 2449
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.3334275035098752,
+      "learning_rate": 2.0971737622883515e-07,
+      "loss": 0.5496,
+      "step": 2450
+    },
+    {
+      "epoch": 0.9804,
+      "grad_norm": 0.3318676137989981,
+      "learning_rate": 2.01415357136836e-07,
+      "loss": 0.5564,
+      "step": 2451
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.39134888539753543,
+      "learning_rate": 1.93280832556475e-07,
+      "loss": 0.6444,
+      "step": 2452
+    },
+    {
+      "epoch": 0.9812,
+      "grad_norm": 0.35476133630040274,
+      "learning_rate": 1.853138161401313e-07,
+      "loss": 0.6449,
+      "step": 2453
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.34844717565920313,
+      "learning_rate": 1.7751432125903134e-07,
+      "loss": 0.6005,
+      "step": 2454
+    },
+    {
+      "epoch": 0.982,
+      "grad_norm": 0.3451148912982019,
+      "learning_rate": 1.6988236100329292e-07,
+      "loss": 0.6096,
+      "step": 2455
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3760768751207625,
+      "learning_rate": 1.6241794818180333e-07,
+      "loss": 0.5715,
+      "step": 2456
+    },
+    {
+      "epoch": 0.9828,
+      "grad_norm": 0.34236614555096034,
+      "learning_rate": 1.5512109532229702e-07,
+      "loss": 0.5783,
+      "step": 2457
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.3648978677422674,
+      "learning_rate": 1.4799181467125556e-07,
+      "loss": 0.6236,
+      "step": 2458
+    },
+    {
+      "epoch": 0.9836,
+      "grad_norm": 0.3382695517918823,
+      "learning_rate": 1.4103011819395217e-07,
+      "loss": 0.6144,
+      "step": 2459
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.33547126455341314,
+      "learning_rate": 1.3423601757436287e-07,
+      "loss": 0.5946,
+      "step": 2460
+    },
+    {
+      "epoch": 0.9844,
+      "grad_norm": 0.35722960414129934,
+      "learning_rate": 1.276095242151998e-07,
+      "loss": 0.5891,
+      "step": 2461
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.35703466561425545,
+      "learning_rate": 1.211506492378778e-07,
+      "loss": 0.5573,
+      "step": 2462
+    },
+    {
+      "epoch": 0.9852,
+      "grad_norm": 0.3600369050933721,
+      "learning_rate": 1.1485940348249235e-07,
+      "loss": 0.5731,
+      "step": 2463
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.35005886164808475,
+      "learning_rate": 1.0873579750780849e-07,
+      "loss": 0.6015,
+      "step": 2464
+    },
+    {
+      "epoch": 0.986,
+      "grad_norm": 0.38173089236257796,
+      "learning_rate": 1.0277984159122733e-07,
+      "loss": 0.5207,
+      "step": 2465
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.3389675973408878,
+      "learning_rate": 9.699154572877511e-08,
+      "loss": 0.5788,
+      "step": 2466
+    },
+    {
+      "epoch": 0.9868,
+      "grad_norm": 0.32971814138367084,
+      "learning_rate": 9.137091963510314e-08,
+      "loss": 0.5361,
+      "step": 2467
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3278269739115661,
+      "learning_rate": 8.591797274344338e-08,
+      "loss": 0.5783,
+      "step": 2468
+    },
+    {
+      "epoch": 0.9876,
+      "grad_norm": 0.35134902862350553,
+      "learning_rate": 8.063271420563068e-08,
+      "loss": 0.5904,
+      "step": 2469
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.3282824064837508,
+      "learning_rate": 7.551515289203615e-08,
+      "loss": 0.5787,
+      "step": 2470
+    },
+    {
+      "epoch": 0.9884,
+      "grad_norm": 0.39575686355806367,
+      "learning_rate": 7.056529739158934e-08,
+      "loss": 0.5738,
+      "step": 2471
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.34569557216135277,
+      "learning_rate": 6.578315601177831e-08,
+      "loss": 0.5655,
+      "step": 2472
+    },
+    {
+      "epoch": 0.9892,
+      "grad_norm": 0.35208160623381174,
+      "learning_rate": 6.116873677858292e-08,
+      "loss": 0.5656,
+      "step": 2473
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.3689703449415024,
+      "learning_rate": 5.6722047436497116e-08,
+      "loss": 0.546,
+      "step": 2474
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.3988292102292916,
+      "learning_rate": 5.2443095448506674e-08,
+      "loss": 0.5201,
+      "step": 2475
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3610586624211906,
+      "learning_rate": 4.8331887996100336e-08,
+      "loss": 0.6618,
+      "step": 2476
+    },
+    {
+      "epoch": 0.9908,
+      "grad_norm": 0.3610758732511304,
+      "learning_rate": 4.438843197922538e-08,
+      "loss": 0.5826,
+      "step": 2477
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.3249391309986885,
+      "learning_rate": 4.061273401627652e-08,
+      "loss": 0.5627,
+      "step": 2478
+    },
+    {
+      "epoch": 0.9916,
+      "grad_norm": 0.37379325075132186,
+      "learning_rate": 3.7004800444095935e-08,
+      "loss": 0.6008,
+      "step": 2479
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3391315745868408,
+      "learning_rate": 3.356463731798432e-08,
+      "loss": 0.5779,
+      "step": 2480
+    },
+    {
+      "epoch": 0.9924,
+      "grad_norm": 0.3519546846907048,
+      "learning_rate": 3.0292250411645404e-08,
+      "loss": 0.5718,
+      "step": 2481
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.3654826071464851,
+      "learning_rate": 2.718764521721928e-08,
+      "loss": 0.5915,
+      "step": 2482
+    },
+    {
+      "epoch": 0.9932,
+      "grad_norm": 0.3339716780119255,
+      "learning_rate": 2.4250826945226847e-08,
+      "loss": 0.531,
+      "step": 2483
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3591835383870838,
+      "learning_rate": 2.148180052462534e-08,
+      "loss": 0.5962,
+      "step": 2484
+    },
+    {
+      "epoch": 0.994,
+      "grad_norm": 0.32906026676200706,
+      "learning_rate": 1.888057060274173e-08,
+      "loss": 0.5505,
+      "step": 2485
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.33556356391048286,
+      "learning_rate": 1.6447141545272715e-08,
+      "loss": 0.5274,
+      "step": 2486
+    },
+    {
+      "epoch": 0.9948,
+      "grad_norm": 0.3714304373448778,
+      "learning_rate": 1.4181517436306912e-08,
+      "loss": 0.6015,
+      "step": 2487
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3610303975888095,
+      "learning_rate": 1.2083702078302672e-08,
+      "loss": 0.5901,
+      "step": 2488
+    },
+    {
+      "epoch": 0.9956,
+      "grad_norm": 0.3408485582669479,
+      "learning_rate": 1.0153698992088068e-08,
+      "loss": 0.5576,
+      "step": 2489
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.47199005918345777,
+      "learning_rate": 8.391511416816489e-09,
+      "loss": 0.5967,
+      "step": 2490
+    },
+    {
+      "epoch": 0.9964,
+      "grad_norm": 0.35849436583308514,
+      "learning_rate": 6.797142310022153e-09,
+      "loss": 0.563,
+      "step": 2491
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3162945272534749,
+      "learning_rate": 5.370594347575697e-09,
+      "loss": 0.5457,
+      "step": 2492
+    },
+    {
+      "epoch": 0.9972,
+      "grad_norm": 0.34495301983804705,
+      "learning_rate": 4.111869923684175e-09,
+      "loss": 0.5793,
+      "step": 2493
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.3482463638127228,
+      "learning_rate": 3.0209711509132654e-09,
+      "loss": 0.6114,
+      "step": 2494
+    },
+    {
+      "epoch": 0.998,
+      "grad_norm": 0.34834264419494415,
+      "learning_rate": 2.0978998601206556e-09,
+      "loss": 0.5633,
+      "step": 2495
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.37229288685720197,
+      "learning_rate": 1.342657600544861e-09,
+      "loss": 0.6379,
+      "step": 2496
+    },
+    {
+      "epoch": 0.9988,
+      "grad_norm": 0.32919562198756436,
+      "learning_rate": 7.552456397053043e-10,
+      "loss": 0.539,
+      "step": 2497
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.35739179206642846,
+      "learning_rate": 3.3566496349113353e-10,
+      "loss": 0.6179,
+      "step": 2498
+    },
+    {
+      "epoch": 0.9996,
+      "grad_norm": 0.377010254391696,
+      "learning_rate": 8.391627608350661e-11,
+      "loss": 0.6228,
+      "step": 2499
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.36175449337551296,
+      "learning_rate": 0.0,
+      "loss": 0.6089,
+      "step": 2500
+    },
+    {
+      "epoch": 1.0,
+      "step": 2500,
+      "total_flos": 2225655608836096.0,
+      "train_loss": 0.6471238312482834,
+      "train_runtime": 39645.3417,
+      "train_samples_per_second": 1.009,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 2500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2225655608836096.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..509139647e40f99c318a4486c4fd46b95c571fbc
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..76f2f8522bd15a8067d741ce043c1cb784403ec4
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2929435931b50af416e3c32a44e2be03d7c0e670a8dcac5ca650b01382413dff
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cf7c6c0c6a545a788c84b1a27a7646822f148cac
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2002d93f399005e211b3ff01fdaca15d40b94b01b86add68364e0a8731bb38df
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b7ce7ff70e213d78a3f684616aa6b58a9945a3b
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,21917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00032,
+      "grad_norm": 1.1731204456473057,
+      "learning_rate": 2.1276595744680853e-06,
+      "loss": 1.5375,
+      "step": 1
+    },
+    {
+      "epoch": 0.00064,
+      "grad_norm": 1.0874451922760042,
+      "learning_rate": 4.255319148936171e-06,
+      "loss": 1.5737,
+      "step": 2
+    },
+    {
+      "epoch": 0.00096,
+      "grad_norm": 1.1522666344335273,
+      "learning_rate": 6.3829787234042555e-06,
+      "loss": 1.574,
+      "step": 3
+    },
+    {
+      "epoch": 0.00128,
+      "grad_norm": 1.1438797548320527,
+      "learning_rate": 8.510638297872341e-06,
+      "loss": 1.5675,
+      "step": 4
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1527978191782793,
+      "learning_rate": 1.0638297872340426e-05,
+      "loss": 1.4695,
+      "step": 5
+    },
+    {
+      "epoch": 0.00192,
+      "grad_norm": 1.030976505512538,
+      "learning_rate": 1.2765957446808511e-05,
+      "loss": 1.5025,
+      "step": 6
+    },
+    {
+      "epoch": 0.00224,
+      "grad_norm": 0.8994453718426768,
+      "learning_rate": 1.4893617021276596e-05,
+      "loss": 1.4405,
+      "step": 7
+    },
+    {
+      "epoch": 0.00256,
+      "grad_norm": 0.9866812823506541,
+      "learning_rate": 1.7021276595744682e-05,
+      "loss": 1.4088,
+      "step": 8
+    },
+    {
+      "epoch": 0.00288,
+      "grad_norm": 1.005930125260958,
+      "learning_rate": 1.9148936170212766e-05,
+      "loss": 1.3633,
+      "step": 9
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.002795962550857,
+      "learning_rate": 2.1276595744680852e-05,
+      "loss": 1.2755,
+      "step": 10
+    },
+    {
+      "epoch": 0.00352,
+      "grad_norm": 0.9506223845208505,
+      "learning_rate": 2.340425531914894e-05,
+      "loss": 1.2413,
+      "step": 11
+    },
+    {
+      "epoch": 0.00384,
+      "grad_norm": 1.002996990567505,
+      "learning_rate": 2.5531914893617022e-05,
+      "loss": 1.1836,
+      "step": 12
+    },
+    {
+      "epoch": 0.00416,
+      "grad_norm": 1.2914749552786817,
+      "learning_rate": 2.765957446808511e-05,
+      "loss": 1.1046,
+      "step": 13
+    },
+    {
+      "epoch": 0.00448,
+      "grad_norm": 0.8620511781430058,
+      "learning_rate": 2.9787234042553192e-05,
+      "loss": 0.9993,
+      "step": 14
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8815276678933675,
+      "learning_rate": 3.191489361702128e-05,
+      "loss": 1.0141,
+      "step": 15
+    },
+    {
+      "epoch": 0.00512,
+      "grad_norm": 0.8892144594051491,
+      "learning_rate": 3.4042553191489365e-05,
+      "loss": 0.9935,
+      "step": 16
+    },
+    {
+      "epoch": 0.00544,
+      "grad_norm": 0.8060622568937051,
+      "learning_rate": 3.617021276595745e-05,
+      "loss": 1.0498,
+      "step": 17
+    },
+    {
+      "epoch": 0.00576,
+      "grad_norm": 0.7717226275464889,
+      "learning_rate": 3.829787234042553e-05,
+      "loss": 0.9595,
+      "step": 18
+    },
+    {
+      "epoch": 0.00608,
+      "grad_norm": 0.7214390063255575,
+      "learning_rate": 4.0425531914893614e-05,
+      "loss": 0.9034,
+      "step": 19
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6569777573026117,
+      "learning_rate": 4.2553191489361704e-05,
+      "loss": 0.9259,
+      "step": 20
+    },
+    {
+      "epoch": 0.00672,
+      "grad_norm": 0.6105655595251536,
+      "learning_rate": 4.468085106382979e-05,
+      "loss": 1.0036,
+      "step": 21
+    },
+    {
+      "epoch": 0.00704,
+      "grad_norm": 0.5500381506972107,
+      "learning_rate": 4.680851063829788e-05,
+      "loss": 0.8723,
+      "step": 22
+    },
+    {
+      "epoch": 0.00736,
+      "grad_norm": 0.5960513728068643,
+      "learning_rate": 4.893617021276596e-05,
+      "loss": 1.0233,
+      "step": 23
+    },
+    {
+      "epoch": 0.00768,
+      "grad_norm": 0.5967296068618473,
+      "learning_rate": 5.1063829787234044e-05,
+      "loss": 0.8992,
+      "step": 24
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.519443074201136,
+      "learning_rate": 5.319148936170213e-05,
+      "loss": 0.8753,
+      "step": 25
+    },
+    {
+      "epoch": 0.00832,
+      "grad_norm": 0.4925062984123281,
+      "learning_rate": 5.531914893617022e-05,
+      "loss": 0.8894,
+      "step": 26
+    },
+    {
+      "epoch": 0.00864,
+      "grad_norm": 0.5023296539264742,
+      "learning_rate": 5.744680851063831e-05,
+      "loss": 0.8267,
+      "step": 27
+    },
+    {
+      "epoch": 0.00896,
+      "grad_norm": 0.49306587234290833,
+      "learning_rate": 5.9574468085106384e-05,
+      "loss": 0.8744,
+      "step": 28
+    },
+    {
+      "epoch": 0.00928,
+      "grad_norm": 0.48486792040650717,
+      "learning_rate": 6.170212765957447e-05,
+      "loss": 0.8754,
+      "step": 29
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.5032819920258179,
+      "learning_rate": 6.382978723404256e-05,
+      "loss": 0.8425,
+      "step": 30
+    },
+    {
+      "epoch": 0.00992,
+      "grad_norm": 0.5246804064999607,
+      "learning_rate": 6.595744680851063e-05,
+      "loss": 0.8512,
+      "step": 31
+    },
+    {
+      "epoch": 0.01024,
+      "grad_norm": 0.4932282136254817,
+      "learning_rate": 6.808510638297873e-05,
+      "loss": 0.789,
+      "step": 32
+    },
+    {
+      "epoch": 0.01056,
+      "grad_norm": 0.4818690946226732,
+      "learning_rate": 7.021276595744681e-05,
+      "loss": 0.8577,
+      "step": 33
+    },
+    {
+      "epoch": 0.01088,
+      "grad_norm": 0.5182161730047697,
+      "learning_rate": 7.23404255319149e-05,
+      "loss": 0.8742,
+      "step": 34
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5190136272248325,
+      "learning_rate": 7.446808510638298e-05,
+      "loss": 0.8694,
+      "step": 35
+    },
+    {
+      "epoch": 0.01152,
+      "grad_norm": 0.5013872882456878,
+      "learning_rate": 7.659574468085106e-05,
+      "loss": 0.7884,
+      "step": 36
+    },
+    {
+      "epoch": 0.01184,
+      "grad_norm": 0.4755846116681576,
+      "learning_rate": 7.872340425531916e-05,
+      "loss": 0.8186,
+      "step": 37
+    },
+    {
+      "epoch": 0.01216,
+      "grad_norm": 0.5023490769633178,
+      "learning_rate": 8.085106382978723e-05,
+      "loss": 0.8458,
+      "step": 38
+    },
+    {
+      "epoch": 0.01248,
+      "grad_norm": 0.5506541905065853,
+      "learning_rate": 8.297872340425533e-05,
+      "loss": 0.8365,
+      "step": 39
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.49883353079038545,
+      "learning_rate": 8.510638297872341e-05,
+      "loss": 0.8089,
+      "step": 40
+    },
+    {
+      "epoch": 0.01312,
+      "grad_norm": 0.5032322471078805,
+      "learning_rate": 8.723404255319149e-05,
+      "loss": 0.7647,
+      "step": 41
+    },
+    {
+      "epoch": 0.01344,
+      "grad_norm": 0.5300451060396111,
+      "learning_rate": 8.936170212765958e-05,
+      "loss": 0.8643,
+      "step": 42
+    },
+    {
+      "epoch": 0.01376,
+      "grad_norm": 0.5223459318507665,
+      "learning_rate": 9.148936170212766e-05,
+      "loss": 0.8625,
+      "step": 43
+    },
+    {
+      "epoch": 0.01408,
+      "grad_norm": 0.4933444414366919,
+      "learning_rate": 9.361702127659576e-05,
+      "loss": 0.8369,
+      "step": 44
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.45964304834337194,
+      "learning_rate": 9.574468085106384e-05,
+      "loss": 0.7714,
+      "step": 45
+    },
+    {
+      "epoch": 0.01472,
+      "grad_norm": 0.47183346334080883,
+      "learning_rate": 9.787234042553192e-05,
+      "loss": 0.8145,
+      "step": 46
+    },
+    {
+      "epoch": 0.01504,
+      "grad_norm": 0.4889165264405602,
+      "learning_rate": 0.0001,
+      "loss": 0.8636,
+      "step": 47
+    },
+    {
+      "epoch": 0.01536,
+      "grad_norm": 0.45880842758274565,
+      "learning_rate": 0.00010212765957446809,
+      "loss": 0.8467,
+      "step": 48
+    },
+    {
+      "epoch": 0.01568,
+      "grad_norm": 0.4546382255534783,
+      "learning_rate": 0.00010425531914893618,
+      "loss": 0.7218,
+      "step": 49
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.45789199269746717,
+      "learning_rate": 0.00010638297872340425,
+      "loss": 0.8478,
+      "step": 50
+    },
+    {
+      "epoch": 0.01632,
+      "grad_norm": 0.46500190214603254,
+      "learning_rate": 0.00010851063829787234,
+      "loss": 0.8135,
+      "step": 51
+    },
+    {
+      "epoch": 0.01664,
+      "grad_norm": 0.46975396433824307,
+      "learning_rate": 0.00011063829787234043,
+      "loss": 0.7553,
+      "step": 52
+    },
+    {
+      "epoch": 0.01696,
+      "grad_norm": 0.4818383433872269,
+      "learning_rate": 0.00011276595744680852,
+      "loss": 0.7569,
+      "step": 53
+    },
+    {
+      "epoch": 0.01728,
+      "grad_norm": 0.8761618046394473,
+      "learning_rate": 0.00011489361702127661,
+      "loss": 0.7913,
+      "step": 54
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.43519748375111394,
+      "learning_rate": 0.00011702127659574468,
+      "loss": 0.7454,
+      "step": 55
+    },
+    {
+      "epoch": 0.01792,
+      "grad_norm": 0.44196300412314393,
+      "learning_rate": 0.00011914893617021277,
+      "loss": 0.7915,
+      "step": 56
+    },
+    {
+      "epoch": 0.01824,
+      "grad_norm": 0.4320803614200594,
+      "learning_rate": 0.00012127659574468086,
+      "loss": 0.7815,
+      "step": 57
+    },
+    {
+      "epoch": 0.01856,
+      "grad_norm": 0.434454215620284,
+      "learning_rate": 0.00012340425531914893,
+      "loss": 0.8358,
+      "step": 58
+    },
+    {
+      "epoch": 0.01888,
+      "grad_norm": 0.444109592699101,
+      "learning_rate": 0.00012553191489361702,
+      "loss": 0.8034,
+      "step": 59
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.44321008869127104,
+      "learning_rate": 0.00012765957446808513,
+      "loss": 0.7547,
+      "step": 60
+    },
+    {
+      "epoch": 0.01952,
+      "grad_norm": 0.465332043208612,
+      "learning_rate": 0.00012978723404255318,
+      "loss": 0.8228,
+      "step": 61
+    },
+    {
+      "epoch": 0.01984,
+      "grad_norm": 0.45905730231690656,
+      "learning_rate": 0.00013191489361702127,
+      "loss": 0.7571,
+      "step": 62
+    },
+    {
+      "epoch": 0.02016,
+      "grad_norm": 0.44239081478921777,
+      "learning_rate": 0.00013404255319148938,
+      "loss": 0.7157,
+      "step": 63
+    },
+    {
+      "epoch": 0.02048,
+      "grad_norm": 0.4376818526592498,
+      "learning_rate": 0.00013617021276595746,
+      "loss": 0.7761,
+      "step": 64
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.4641660188193671,
+      "learning_rate": 0.00013829787234042554,
+      "loss": 0.7672,
+      "step": 65
+    },
+    {
+      "epoch": 0.02112,
+      "grad_norm": 0.4437586543840469,
+      "learning_rate": 0.00014042553191489363,
+      "loss": 0.7645,
+      "step": 66
+    },
+    {
+      "epoch": 0.02144,
+      "grad_norm": 0.43784489225170686,
+      "learning_rate": 0.0001425531914893617,
+      "loss": 0.7887,
+      "step": 67
+    },
+    {
+      "epoch": 0.02176,
+      "grad_norm": 0.4348987607939572,
+      "learning_rate": 0.0001446808510638298,
+      "loss": 0.7532,
+      "step": 68
+    },
+    {
+      "epoch": 0.02208,
+      "grad_norm": 0.4439487163648935,
+      "learning_rate": 0.00014680851063829788,
+      "loss": 0.7627,
+      "step": 69
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.43865263620003636,
+      "learning_rate": 0.00014893617021276596,
+      "loss": 0.7902,
+      "step": 70
+    },
+    {
+      "epoch": 0.02272,
+      "grad_norm": 0.42266616421302383,
+      "learning_rate": 0.00015106382978723407,
+      "loss": 0.7184,
+      "step": 71
+    },
+    {
+      "epoch": 0.02304,
+      "grad_norm": 0.4305898725706256,
+      "learning_rate": 0.00015319148936170213,
+      "loss": 0.7616,
+      "step": 72
+    },
+    {
+      "epoch": 0.02336,
+      "grad_norm": 0.44985286747216346,
+      "learning_rate": 0.0001553191489361702,
+      "loss": 0.7949,
+      "step": 73
+    },
+    {
+      "epoch": 0.02368,
+      "grad_norm": 0.4431490716592776,
+      "learning_rate": 0.00015744680851063832,
+      "loss": 0.7553,
+      "step": 74
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.42491119706935265,
+      "learning_rate": 0.00015957446808510637,
+      "loss": 0.784,
+      "step": 75
+    },
+    {
+      "epoch": 0.02432,
+      "grad_norm": 0.42683973187367136,
+      "learning_rate": 0.00016170212765957446,
+      "loss": 0.7743,
+      "step": 76
+    },
+    {
+      "epoch": 0.02464,
+      "grad_norm": 0.4424036302783349,
+      "learning_rate": 0.00016382978723404257,
+      "loss": 0.7532,
+      "step": 77
+    },
+    {
+      "epoch": 0.02496,
+      "grad_norm": 0.4939331210509091,
+      "learning_rate": 0.00016595744680851065,
+      "loss": 0.7956,
+      "step": 78
+    },
+    {
+      "epoch": 0.02528,
+      "grad_norm": 0.451524645795332,
+      "learning_rate": 0.00016808510638297873,
+      "loss": 0.8558,
+      "step": 79
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.46899146025448957,
+      "learning_rate": 0.00017021276595744682,
+      "loss": 0.7842,
+      "step": 80
+    },
+    {
+      "epoch": 0.02592,
+      "grad_norm": 0.4257699612799091,
+      "learning_rate": 0.0001723404255319149,
+      "loss": 0.7888,
+      "step": 81
+    },
+    {
+      "epoch": 0.02624,
+      "grad_norm": 0.4163742024238363,
+      "learning_rate": 0.00017446808510638298,
+      "loss": 0.7359,
+      "step": 82
+    },
+    {
+      "epoch": 0.02656,
+      "grad_norm": 0.43745821044811817,
+      "learning_rate": 0.00017659574468085107,
+      "loss": 0.8091,
+      "step": 83
+    },
+    {
+      "epoch": 0.02688,
+      "grad_norm": 0.46021608034195793,
+      "learning_rate": 0.00017872340425531915,
+      "loss": 0.8086,
+      "step": 84
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4344927830913337,
+      "learning_rate": 0.00018085106382978726,
+      "loss": 0.7739,
+      "step": 85
+    },
+    {
+      "epoch": 0.02752,
+      "grad_norm": 0.4562564386414459,
+      "learning_rate": 0.00018297872340425532,
+      "loss": 0.8199,
+      "step": 86
+    },
+    {
+      "epoch": 0.02784,
+      "grad_norm": 0.4622391342149164,
+      "learning_rate": 0.0001851063829787234,
+      "loss": 0.7877,
+      "step": 87
+    },
+    {
+      "epoch": 0.02816,
+      "grad_norm": 0.43992782341599485,
+      "learning_rate": 0.0001872340425531915,
+      "loss": 0.7858,
+      "step": 88
+    },
+    {
+      "epoch": 0.02848,
+      "grad_norm": 0.47778531966000015,
+      "learning_rate": 0.00018936170212765957,
+      "loss": 0.7728,
+      "step": 89
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.43201711511780055,
+      "learning_rate": 0.00019148936170212768,
+      "loss": 0.7886,
+      "step": 90
+    },
+    {
+      "epoch": 0.02912,
+      "grad_norm": 0.4051612223053757,
+      "learning_rate": 0.00019361702127659576,
+      "loss": 0.7675,
+      "step": 91
+    },
+    {
+      "epoch": 0.02944,
+      "grad_norm": 0.43458334760183204,
+      "learning_rate": 0.00019574468085106384,
+      "loss": 0.7546,
+      "step": 92
+    },
+    {
+      "epoch": 0.02976,
+      "grad_norm": 0.44083664287493324,
+      "learning_rate": 0.00019787234042553193,
+      "loss": 0.7344,
+      "step": 93
+    },
+    {
+      "epoch": 0.03008,
+      "grad_norm": 0.4384926816131158,
+      "learning_rate": 0.0002,
+      "loss": 0.7635,
+      "step": 94
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4593902461605018,
+      "learning_rate": 0.00019999994628472071,
+      "loss": 0.783,
+      "step": 95
+    },
+    {
+      "epoch": 0.03072,
+      "grad_norm": 0.40538192170834214,
+      "learning_rate": 0.00019999978513894056,
+      "loss": 0.7276,
+      "step": 96
+    },
+    {
+      "epoch": 0.03104,
+      "grad_norm": 0.4335493052472172,
+      "learning_rate": 0.00019999951656283268,
+      "loss": 0.7662,
+      "step": 97
+    },
+    {
+      "epoch": 0.03136,
+      "grad_norm": 0.4439500994385527,
+      "learning_rate": 0.00019999914055668561,
+      "loss": 0.8079,
+      "step": 98
+    },
+    {
+      "epoch": 0.03168,
+      "grad_norm": 0.4443586289405873,
+      "learning_rate": 0.00019999865712090327,
+      "loss": 0.7917,
+      "step": 99
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.43339383197052117,
+      "learning_rate": 0.000199998066256005,
+      "loss": 0.7308,
+      "step": 100
+    },
+    {
+      "epoch": 0.03232,
+      "grad_norm": 0.42602679580634845,
+      "learning_rate": 0.00019999736796262564,
+      "loss": 0.7523,
+      "step": 101
+    },
+    {
+      "epoch": 0.03264,
+      "grad_norm": 0.43622610490088404,
+      "learning_rate": 0.00019999656224151528,
+      "loss": 0.8239,
+      "step": 102
+    },
+    {
+      "epoch": 0.03296,
+      "grad_norm": 0.38275105288948313,
+      "learning_rate": 0.00019999564909353962,
+      "loss": 0.7389,
+      "step": 103
+    },
+    {
+      "epoch": 0.03328,
+      "grad_norm": 0.4154825734979613,
+      "learning_rate": 0.00019999462851967952,
+      "loss": 0.7232,
+      "step": 104
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4197450959577546,
+      "learning_rate": 0.00019999350052103153,
+      "loss": 0.6809,
+      "step": 105
+    },
+    {
+      "epoch": 0.03392,
+      "grad_norm": 0.4481345587105765,
+      "learning_rate": 0.00019999226509880735,
+      "loss": 0.7461,
+      "step": 106
+    },
+    {
+      "epoch": 0.03424,
+      "grad_norm": 0.43864292562090484,
+      "learning_rate": 0.00019999092225433428,
+      "loss": 0.7475,
+      "step": 107
+    },
+    {
+      "epoch": 0.03456,
+      "grad_norm": 0.4351432662380862,
+      "learning_rate": 0.0001999894719890549,
+      "loss": 0.7112,
+      "step": 108
+    },
+    {
+      "epoch": 0.03488,
+      "grad_norm": 0.4340270376809887,
+      "learning_rate": 0.0001999879143045273,
+      "loss": 0.8173,
+      "step": 109
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4419247595812773,
+      "learning_rate": 0.00019998624920242482,
+      "loss": 0.7257,
+      "step": 110
+    },
+    {
+      "epoch": 0.03552,
+      "grad_norm": 0.42418034402832416,
+      "learning_rate": 0.00019998447668453633,
+      "loss": 0.7403,
+      "step": 111
+    },
+    {
+      "epoch": 0.03584,
+      "grad_norm": 0.42013132890142585,
+      "learning_rate": 0.00019998259675276607,
+      "loss": 0.7273,
+      "step": 112
+    },
+    {
+      "epoch": 0.03616,
+      "grad_norm": 0.4195797152062477,
+      "learning_rate": 0.00019998060940913366,
+      "loss": 0.7359,
+      "step": 113
+    },
+    {
+      "epoch": 0.03648,
+      "grad_norm": 0.47343513465246695,
+      "learning_rate": 0.0001999785146557741,
+      "loss": 0.7648,
+      "step": 114
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4314934182495577,
+      "learning_rate": 0.0001999763124949378,
+      "loss": 0.7369,
+      "step": 115
+    },
+    {
+      "epoch": 0.03712,
+      "grad_norm": 0.4169870682876902,
+      "learning_rate": 0.00019997400292899055,
+      "loss": 0.7623,
+      "step": 116
+    },
+    {
+      "epoch": 0.03744,
+      "grad_norm": 0.42705686220444367,
+      "learning_rate": 0.00019997158596041353,
+      "loss": 0.7748,
+      "step": 117
+    },
+    {
+      "epoch": 0.03776,
+      "grad_norm": 0.43707761829477404,
+      "learning_rate": 0.00019996906159180334,
+      "loss": 0.7721,
+      "step": 118
+    },
+    {
+      "epoch": 0.03808,
+      "grad_norm": 0.4303830646358726,
+      "learning_rate": 0.00019996642982587182,
+      "loss": 0.7439,
+      "step": 119
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4697790863576078,
+      "learning_rate": 0.00019996369066544643,
+      "loss": 0.7746,
+      "step": 120
+    },
+    {
+      "epoch": 0.03872,
+      "grad_norm": 0.41907770314531756,
+      "learning_rate": 0.00019996084411346975,
+      "loss": 0.7242,
+      "step": 121
+    },
+    {
+      "epoch": 0.03904,
+      "grad_norm": 0.4231403841755841,
+      "learning_rate": 0.0001999578901729999,
+      "loss": 0.7389,
+      "step": 122
+    },
+    {
+      "epoch": 0.03936,
+      "grad_norm": 0.4420251841404513,
+      "learning_rate": 0.0001999548288472103,
+      "loss": 0.7664,
+      "step": 123
+    },
+    {
+      "epoch": 0.03968,
+      "grad_norm": 0.4132985858471485,
+      "learning_rate": 0.00019995166013938976,
+      "loss": 0.7271,
+      "step": 124
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4186223243627318,
+      "learning_rate": 0.00019994838405294247,
+      "loss": 0.7653,
+      "step": 125
+    },
+    {
+      "epoch": 0.04032,
+      "grad_norm": 0.39363335788049236,
+      "learning_rate": 0.0001999450005913879,
+      "loss": 0.7418,
+      "step": 126
+    },
+    {
+      "epoch": 0.04064,
+      "grad_norm": 0.42970976851958553,
+      "learning_rate": 0.00019994150975836093,
+      "loss": 0.78,
+      "step": 127
+    },
+    {
+      "epoch": 0.04096,
+      "grad_norm": 0.4229964846342404,
+      "learning_rate": 0.0001999379115576118,
+      "loss": 0.7799,
+      "step": 128
+    },
+    {
+      "epoch": 0.04128,
+      "grad_norm": 0.44877061698998616,
+      "learning_rate": 0.00019993420599300602,
+      "loss": 0.8015,
+      "step": 129
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4305286517221353,
+      "learning_rate": 0.00019993039306852458,
+      "loss": 0.8373,
+      "step": 130
+    },
+    {
+      "epoch": 0.04192,
+      "grad_norm": 0.42284280674688873,
+      "learning_rate": 0.00019992647278826368,
+      "loss": 0.7611,
+      "step": 131
+    },
+    {
+      "epoch": 0.04224,
+      "grad_norm": 0.4609941178480334,
+      "learning_rate": 0.0001999224451564349,
+      "loss": 0.7681,
+      "step": 132
+    },
+    {
+      "epoch": 0.04256,
+      "grad_norm": 0.4488488848602971,
+      "learning_rate": 0.00019991831017736518,
+      "loss": 0.8339,
+      "step": 133
+    },
+    {
+      "epoch": 0.04288,
+      "grad_norm": 0.44277414607454013,
+      "learning_rate": 0.0001999140678554967,
+      "loss": 0.7883,
+      "step": 134
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.3951794719168181,
+      "learning_rate": 0.00019990971819538707,
+      "loss": 0.7472,
+      "step": 135
+    },
+    {
+      "epoch": 0.04352,
+      "grad_norm": 0.4284900147790867,
+      "learning_rate": 0.00019990526120170908,
+      "loss": 0.6998,
+      "step": 136
+    },
+    {
+      "epoch": 0.04384,
+      "grad_norm": 0.4431910180861046,
+      "learning_rate": 0.00019990069687925098,
+      "loss": 0.7936,
+      "step": 137
+    },
+    {
+      "epoch": 0.04416,
+      "grad_norm": 0.43552565116953146,
+      "learning_rate": 0.0001998960252329162,
+      "loss": 0.7357,
+      "step": 138
+    },
+    {
+      "epoch": 0.04448,
+      "grad_norm": 0.44525793652397894,
+      "learning_rate": 0.00019989124626772353,
+      "loss": 0.756,
+      "step": 139
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.43809580788943686,
+      "learning_rate": 0.00019988635998880702,
+      "loss": 0.7998,
+      "step": 140
+    },
+    {
+      "epoch": 0.04512,
+      "grad_norm": 0.4019782963026797,
+      "learning_rate": 0.00019988136640141608,
+      "loss": 0.748,
+      "step": 141
+    },
+    {
+      "epoch": 0.04544,
+      "grad_norm": 0.49782136174784086,
+      "learning_rate": 0.00019987626551091526,
+      "loss": 0.7979,
+      "step": 142
+    },
+    {
+      "epoch": 0.04576,
+      "grad_norm": 0.5296834431801735,
+      "learning_rate": 0.00019987105732278458,
+      "loss": 0.7204,
+      "step": 143
+    },
+    {
+      "epoch": 0.04608,
+      "grad_norm": 0.4176342903284688,
+      "learning_rate": 0.00019986574184261912,
+      "loss": 0.7115,
+      "step": 144
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.3998532267253694,
+      "learning_rate": 0.0001998603190761294,
+      "loss": 0.7143,
+      "step": 145
+    },
+    {
+      "epoch": 0.04672,
+      "grad_norm": 0.4326528288413225,
+      "learning_rate": 0.00019985478902914114,
+      "loss": 0.7296,
+      "step": 146
+    },
+    {
+      "epoch": 0.04704,
+      "grad_norm": 0.42619185588922404,
+      "learning_rate": 0.00019984915170759526,
+      "loss": 0.7583,
+      "step": 147
+    },
+    {
+      "epoch": 0.04736,
+      "grad_norm": 0.40430419383774363,
+      "learning_rate": 0.00019984340711754796,
+      "loss": 0.6987,
+      "step": 148
+    },
+    {
+      "epoch": 0.04768,
+      "grad_norm": 0.4467205333763215,
+      "learning_rate": 0.00019983755526517075,
+      "loss": 0.732,
+      "step": 149
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.388442786464422,
+      "learning_rate": 0.0001998315961567502,
+      "loss": 0.7369,
+      "step": 150
+    },
+    {
+      "epoch": 0.04832,
+      "grad_norm": 0.3880083864543083,
+      "learning_rate": 0.00019982552979868828,
+      "loss": 0.7397,
+      "step": 151
+    },
+    {
+      "epoch": 0.04864,
+      "grad_norm": 0.39891635644433154,
+      "learning_rate": 0.00019981935619750214,
+      "loss": 0.7184,
+      "step": 152
+    },
+    {
+      "epoch": 0.04896,
+      "grad_norm": 0.4135459316469171,
+      "learning_rate": 0.00019981307535982406,
+      "loss": 0.7289,
+      "step": 153
+    },
+    {
+      "epoch": 0.04928,
+      "grad_norm": 0.3861488505399555,
+      "learning_rate": 0.00019980668729240158,
+      "loss": 0.6985,
+      "step": 154
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.44608865752548255,
+      "learning_rate": 0.0001998001920020975,
+      "loss": 0.7259,
+      "step": 155
+    },
+    {
+      "epoch": 0.04992,
+      "grad_norm": 0.42100894479871004,
+      "learning_rate": 0.0001997935894958897,
+      "loss": 0.77,
+      "step": 156
+    },
+    {
+      "epoch": 0.05024,
+      "grad_norm": 0.4174110619045265,
+      "learning_rate": 0.00019978687978087126,
+      "loss": 0.7178,
+      "step": 157
+    },
+    {
+      "epoch": 0.05056,
+      "grad_norm": 0.4308663947925414,
+      "learning_rate": 0.0001997800628642505,
+      "loss": 0.8116,
+      "step": 158
+    },
+    {
+      "epoch": 0.05088,
+      "grad_norm": 0.4001049289467878,
+      "learning_rate": 0.0001997731387533509,
+      "loss": 0.718,
+      "step": 159
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.42079041121567273,
+      "learning_rate": 0.000199766107455611,
+      "loss": 0.7559,
+      "step": 160
+    },
+    {
+      "epoch": 0.05152,
+      "grad_norm": 0.400988421086698,
+      "learning_rate": 0.00019975896897858462,
+      "loss": 0.7585,
+      "step": 161
+    },
+    {
+      "epoch": 0.05184,
+      "grad_norm": 0.40519314433787235,
+      "learning_rate": 0.00019975172332994064,
+      "loss": 0.7746,
+      "step": 162
+    },
+    {
+      "epoch": 0.05216,
+      "grad_norm": 0.4011011567396514,
+      "learning_rate": 0.0001997443705174631,
+      "loss": 0.7249,
+      "step": 163
+    },
+    {
+      "epoch": 0.05248,
+      "grad_norm": 0.39896690231529963,
+      "learning_rate": 0.0001997369105490512,
+      "loss": 0.7207,
+      "step": 164
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.39029807861651156,
+      "learning_rate": 0.0001997293434327192,
+      "loss": 0.6764,
+      "step": 165
+    },
+    {
+      "epoch": 0.05312,
+      "grad_norm": 0.40871952633787084,
+      "learning_rate": 0.00019972166917659647,
+      "loss": 0.7614,
+      "step": 166
+    },
+    {
+      "epoch": 0.05344,
+      "grad_norm": 0.3979016647270798,
+      "learning_rate": 0.00019971388778892754,
+      "loss": 0.6654,
+      "step": 167
+    },
+    {
+      "epoch": 0.05376,
+      "grad_norm": 0.42043578133981985,
+      "learning_rate": 0.00019970599927807202,
+      "loss": 0.7452,
+      "step": 168
+    },
+    {
+      "epoch": 0.05408,
+      "grad_norm": 0.4072320901060178,
+      "learning_rate": 0.0001996980036525045,
+      "loss": 0.7072,
+      "step": 169
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3973739493689002,
+      "learning_rate": 0.0001996899009208148,
+      "loss": 0.7701,
+      "step": 170
+    },
+    {
+      "epoch": 0.05472,
+      "grad_norm": 0.39021026431850786,
+      "learning_rate": 0.00019968169109170773,
+      "loss": 0.7587,
+      "step": 171
+    },
+    {
+      "epoch": 0.05504,
+      "grad_norm": 0.412900634118538,
+      "learning_rate": 0.00019967337417400313,
+      "loss": 0.7574,
+      "step": 172
+    },
+    {
+      "epoch": 0.05536,
+      "grad_norm": 0.43102406990920983,
+      "learning_rate": 0.0001996649501766359,
+      "loss": 0.7588,
+      "step": 173
+    },
+    {
+      "epoch": 0.05568,
+      "grad_norm": 0.41930394190492015,
+      "learning_rate": 0.000199656419108656,
+      "loss": 0.7442,
+      "step": 174
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.43674689725598764,
+      "learning_rate": 0.0001996477809792284,
+      "loss": 0.7352,
+      "step": 175
+    },
+    {
+      "epoch": 0.05632,
+      "grad_norm": 0.4453725734876997,
+      "learning_rate": 0.00019963903579763313,
+      "loss": 0.7385,
+      "step": 176
+    },
+    {
+      "epoch": 0.05664,
+      "grad_norm": 0.42876817533957673,
+      "learning_rate": 0.0001996301835732651,
+      "loss": 0.7217,
+      "step": 177
+    },
+    {
+      "epoch": 0.05696,
+      "grad_norm": 0.418426365088918,
+      "learning_rate": 0.0001996212243156344,
+      "loss": 0.762,
+      "step": 178
+    },
+    {
+      "epoch": 0.05728,
+      "grad_norm": 0.4103204533243087,
+      "learning_rate": 0.00019961215803436595,
+      "loss": 0.7672,
+      "step": 179
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.38899188788309413,
+      "learning_rate": 0.00019960298473919972,
+      "loss": 0.7198,
+      "step": 180
+    },
+    {
+      "epoch": 0.05792,
+      "grad_norm": 0.42512049915701683,
+      "learning_rate": 0.00019959370443999063,
+      "loss": 0.7572,
+      "step": 181
+    },
+    {
+      "epoch": 0.05824,
+      "grad_norm": 0.39589404157816827,
+      "learning_rate": 0.00019958431714670857,
+      "loss": 0.7856,
+      "step": 182
+    },
+    {
+      "epoch": 0.05856,
+      "grad_norm": 0.4005547070868384,
+      "learning_rate": 0.00019957482286943838,
+      "loss": 0.7429,
+      "step": 183
+    },
+    {
+      "epoch": 0.05888,
+      "grad_norm": 0.4211613797439173,
+      "learning_rate": 0.00019956522161837975,
+      "loss": 0.7364,
+      "step": 184
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.39825753824106586,
+      "learning_rate": 0.00019955551340384743,
+      "loss": 0.73,
+      "step": 185
+    },
+    {
+      "epoch": 0.05952,
+      "grad_norm": 0.4033078721501204,
+      "learning_rate": 0.000199545698236271,
+      "loss": 0.7517,
+      "step": 186
+    },
+    {
+      "epoch": 0.05984,
+      "grad_norm": 0.409040308738097,
+      "learning_rate": 0.00019953577612619484,
+      "loss": 0.7586,
+      "step": 187
+    },
+    {
+      "epoch": 0.06016,
+      "grad_norm": 0.40795573538716995,
+      "learning_rate": 0.00019952574708427849,
+      "loss": 0.7151,
+      "step": 188
+    },
+    {
+      "epoch": 0.06048,
+      "grad_norm": 0.40579597124070016,
+      "learning_rate": 0.00019951561112129614,
+      "loss": 0.7847,
+      "step": 189
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.41542554720413943,
+      "learning_rate": 0.00019950536824813684,
+      "loss": 0.7643,
+      "step": 190
+    },
+    {
+      "epoch": 0.06112,
+      "grad_norm": 0.38383861271606445,
+      "learning_rate": 0.00019949501847580468,
+      "loss": 0.6891,
+      "step": 191
+    },
+    {
+      "epoch": 0.06144,
+      "grad_norm": 0.39631534097947313,
+      "learning_rate": 0.0001994845618154184,
+      "loss": 0.7638,
+      "step": 192
+    },
+    {
+      "epoch": 0.06176,
+      "grad_norm": 0.4136445289531319,
+      "learning_rate": 0.00019947399827821167,
+      "loss": 0.6831,
+      "step": 193
+    },
+    {
+      "epoch": 0.06208,
+      "grad_norm": 0.4193170159492938,
+      "learning_rate": 0.000199463327875533,
+      "loss": 0.7256,
+      "step": 194
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4293844050838592,
+      "learning_rate": 0.00019945255061884558,
+      "loss": 0.7581,
+      "step": 195
+    },
+    {
+      "epoch": 0.06272,
+      "grad_norm": 0.42058192682057277,
+      "learning_rate": 0.00019944166651972753,
+      "loss": 0.7437,
+      "step": 196
+    },
+    {
+      "epoch": 0.06304,
+      "grad_norm": 0.4290448472953912,
+      "learning_rate": 0.00019943067558987173,
+      "loss": 0.7493,
+      "step": 197
+    },
+    {
+      "epoch": 0.06336,
+      "grad_norm": 0.37354230740165073,
+      "learning_rate": 0.0001994195778410857,
+      "loss": 0.7365,
+      "step": 198
+    },
+    {
+      "epoch": 0.06368,
+      "grad_norm": 0.44989861229606387,
+      "learning_rate": 0.0001994083732852919,
+      "loss": 0.7501,
+      "step": 199
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3999556571515187,
+      "learning_rate": 0.00019939706193452744,
+      "loss": 0.6995,
+      "step": 200
+    },
+    {
+      "epoch": 0.06432,
+      "grad_norm": 0.4906851537775836,
+      "learning_rate": 0.00019938564380094414,
+      "loss": 0.7537,
+      "step": 201
+    },
+    {
+      "epoch": 0.06464,
+      "grad_norm": 0.41015748077898057,
+      "learning_rate": 0.00019937411889680854,
+      "loss": 0.7069,
+      "step": 202
+    },
+    {
+      "epoch": 0.06496,
+      "grad_norm": 0.4336475847452223,
+      "learning_rate": 0.00019936248723450195,
+      "loss": 0.7481,
+      "step": 203
+    },
+    {
+      "epoch": 0.06528,
+      "grad_norm": 0.4055756795485628,
+      "learning_rate": 0.00019935074882652034,
+      "loss": 0.7076,
+      "step": 204
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.3932595761842593,
+      "learning_rate": 0.0001993389036854743,
+      "loss": 0.7103,
+      "step": 205
+    },
+    {
+      "epoch": 0.06592,
+      "grad_norm": 0.4390232786872728,
+      "learning_rate": 0.0001993269518240892,
+      "loss": 0.7166,
+      "step": 206
+    },
+    {
+      "epoch": 0.06624,
+      "grad_norm": 0.4096971237501023,
+      "learning_rate": 0.0001993148932552049,
+      "loss": 0.7667,
+      "step": 207
+    },
+    {
+      "epoch": 0.06656,
+      "grad_norm": 0.40859693813475584,
+      "learning_rate": 0.00019930272799177607,
+      "loss": 0.7489,
+      "step": 208
+    },
+    {
+      "epoch": 0.06688,
+      "grad_norm": 0.41221207010486643,
+      "learning_rate": 0.00019929045604687187,
+      "loss": 0.7128,
+      "step": 209
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5434540483850174,
+      "learning_rate": 0.00019927807743367611,
+      "loss": 0.7341,
+      "step": 210
+    },
+    {
+      "epoch": 0.06752,
+      "grad_norm": 0.45360060601262314,
+      "learning_rate": 0.00019926559216548728,
+      "loss": 0.7629,
+      "step": 211
+    },
+    {
+      "epoch": 0.06784,
+      "grad_norm": 0.40185826295679233,
+      "learning_rate": 0.0001992530002557183,
+      "loss": 0.7708,
+      "step": 212
+    },
+    {
+      "epoch": 0.06816,
+      "grad_norm": 0.40964848035629653,
+      "learning_rate": 0.00019924030171789676,
+      "loss": 0.745,
+      "step": 213
+    },
+    {
+      "epoch": 0.06848,
+      "grad_norm": 0.3960611256344289,
+      "learning_rate": 0.00019922749656566476,
+      "loss": 0.6921,
+      "step": 214
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4241656510189951,
+      "learning_rate": 0.00019921458481277895,
+      "loss": 0.7762,
+      "step": 215
+    },
+    {
+      "epoch": 0.06912,
+      "grad_norm": 0.387980076780542,
+      "learning_rate": 0.00019920156647311048,
+      "loss": 0.7131,
+      "step": 216
+    },
+    {
+      "epoch": 0.06944,
+      "grad_norm": 0.4054873163160114,
+      "learning_rate": 0.00019918844156064505,
+      "loss": 0.7034,
+      "step": 217
+    },
+    {
+      "epoch": 0.06976,
+      "grad_norm": 0.39539500463069105,
+      "learning_rate": 0.00019917521008948287,
+      "loss": 0.7339,
+      "step": 218
+    },
+    {
+      "epoch": 0.07008,
+      "grad_norm": 0.38885942769563575,
+      "learning_rate": 0.00019916187207383846,
+      "loss": 0.7737,
+      "step": 219
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.42177712975947823,
+      "learning_rate": 0.00019914842752804103,
+      "loss": 0.7967,
+      "step": 220
+    },
+    {
+      "epoch": 0.07072,
+      "grad_norm": 0.4210850229779522,
+      "learning_rate": 0.00019913487646653407,
+      "loss": 0.7173,
+      "step": 221
+    },
+    {
+      "epoch": 0.07104,
+      "grad_norm": 0.40742160560153656,
+      "learning_rate": 0.00019912121890387562,
+      "loss": 0.7331,
+      "step": 222
+    },
+    {
+      "epoch": 0.07136,
+      "grad_norm": 0.4191705551965322,
+      "learning_rate": 0.00019910745485473804,
+      "loss": 0.7154,
+      "step": 223
+    },
+    {
+      "epoch": 0.07168,
+      "grad_norm": 0.4233092491679564,
+      "learning_rate": 0.00019909358433390812,
+      "loss": 0.7744,
+      "step": 224
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4193130476377568,
+      "learning_rate": 0.00019907960735628704,
+      "loss": 0.7612,
+      "step": 225
+    },
+    {
+      "epoch": 0.07232,
+      "grad_norm": 0.40321576798819103,
+      "learning_rate": 0.00019906552393689038,
+      "loss": 0.7599,
+      "step": 226
+    },
+    {
+      "epoch": 0.07264,
+      "grad_norm": 0.38047122311599635,
+      "learning_rate": 0.000199051334090848,
+      "loss": 0.7178,
+      "step": 227
+    },
+    {
+      "epoch": 0.07296,
+      "grad_norm": 0.3951723954239479,
+      "learning_rate": 0.00019903703783340413,
+      "loss": 0.7889,
+      "step": 228
+    },
+    {
+      "epoch": 0.07328,
+      "grad_norm": 0.38408564960849323,
+      "learning_rate": 0.00019902263517991732,
+      "loss": 0.7144,
+      "step": 229
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3835867778635915,
+      "learning_rate": 0.00019900812614586044,
+      "loss": 0.7334,
+      "step": 230
+    },
+    {
+      "epoch": 0.07392,
+      "grad_norm": 0.37616731257706504,
+      "learning_rate": 0.00019899351074682063,
+      "loss": 0.7208,
+      "step": 231
+    },
+    {
+      "epoch": 0.07424,
+      "grad_norm": 0.4059665060035399,
+      "learning_rate": 0.00019897878899849926,
+      "loss": 0.7589,
+      "step": 232
+    },
+    {
+      "epoch": 0.07456,
+      "grad_norm": 0.36633209765115576,
+      "learning_rate": 0.000198963960916712,
+      "loss": 0.6973,
+      "step": 233
+    },
+    {
+      "epoch": 0.07488,
+      "grad_norm": 0.3848590319578474,
+      "learning_rate": 0.00019894902651738878,
+      "loss": 0.6899,
+      "step": 234
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.37115991361388495,
+      "learning_rate": 0.00019893398581657365,
+      "loss": 0.7372,
+      "step": 235
+    },
+    {
+      "epoch": 0.07552,
+      "grad_norm": 0.38925631584518244,
+      "learning_rate": 0.00019891883883042496,
+      "loss": 0.77,
+      "step": 236
+    },
+    {
+      "epoch": 0.07584,
+      "grad_norm": 0.3921187708606024,
+      "learning_rate": 0.0001989035855752152,
+      "loss": 0.7476,
+      "step": 237
+    },
+    {
+      "epoch": 0.07616,
+      "grad_norm": 0.3856452255442644,
+      "learning_rate": 0.000198888226067331,
+      "loss": 0.7207,
+      "step": 238
+    },
+    {
+      "epoch": 0.07648,
+      "grad_norm": 0.3984872935543763,
+      "learning_rate": 0.00019887276032327318,
+      "loss": 0.736,
+      "step": 239
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3915371543475769,
+      "learning_rate": 0.00019885718835965666,
+      "loss": 0.7146,
+      "step": 240
+    },
+    {
+      "epoch": 0.07712,
+      "grad_norm": 0.39266246316836856,
+      "learning_rate": 0.00019884151019321054,
+      "loss": 0.7087,
+      "step": 241
+    },
+    {
+      "epoch": 0.07744,
+      "grad_norm": 0.38461881795166664,
+      "learning_rate": 0.00019882572584077788,
+      "loss": 0.7825,
+      "step": 242
+    },
+    {
+      "epoch": 0.07776,
+      "grad_norm": 0.3924735719126863,
+      "learning_rate": 0.00019880983531931596,
+      "loss": 0.7353,
+      "step": 243
+    },
+    {
+      "epoch": 0.07808,
+      "grad_norm": 0.38095421708008886,
+      "learning_rate": 0.00019879383864589606,
+      "loss": 0.7371,
+      "step": 244
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.41911007880966034,
+      "learning_rate": 0.00019877773583770346,
+      "loss": 0.7486,
+      "step": 245
+    },
+    {
+      "epoch": 0.07872,
+      "grad_norm": 0.6036030468188308,
+      "learning_rate": 0.00019876152691203748,
+      "loss": 0.7411,
+      "step": 246
+    },
+    {
+      "epoch": 0.07904,
+      "grad_norm": 0.39942905648066546,
+      "learning_rate": 0.00019874521188631154,
+      "loss": 0.6852,
+      "step": 247
+    },
+    {
+      "epoch": 0.07936,
+      "grad_norm": 0.4011128910600863,
+      "learning_rate": 0.0001987287907780529,
+      "loss": 0.7182,
+      "step": 248
+    },
+    {
+      "epoch": 0.07968,
+      "grad_norm": 0.40190246584576583,
+      "learning_rate": 0.00019871226360490286,
+      "loss": 0.7275,
+      "step": 249
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.39918816572839777,
+      "learning_rate": 0.00019869563038461664,
+      "loss": 0.7246,
+      "step": 250
+    },
+    {
+      "epoch": 0.08032,
+      "grad_norm": 0.3942953012704703,
+      "learning_rate": 0.00019867889113506343,
+      "loss": 0.7171,
+      "step": 251
+    },
+    {
+      "epoch": 0.08064,
+      "grad_norm": 0.37653074922940427,
+      "learning_rate": 0.00019866204587422627,
+      "loss": 0.6838,
+      "step": 252
+    },
+    {
+      "epoch": 0.08096,
+      "grad_norm": 0.4063054496037096,
+      "learning_rate": 0.00019864509462020217,
+      "loss": 0.7884,
+      "step": 253
+    },
+    {
+      "epoch": 0.08128,
+      "grad_norm": 0.3996311018272426,
+      "learning_rate": 0.0001986280373912019,
+      "loss": 0.7356,
+      "step": 254
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.3919731619260927,
+      "learning_rate": 0.00019861087420555018,
+      "loss": 0.7207,
+      "step": 255
+    },
+    {
+      "epoch": 0.08192,
+      "grad_norm": 0.3753944432703445,
+      "learning_rate": 0.00019859360508168544,
+      "loss": 0.6757,
+      "step": 256
+    },
+    {
+      "epoch": 0.08224,
+      "grad_norm": 0.3972143798271442,
+      "learning_rate": 0.00019857623003816013,
+      "loss": 0.7128,
+      "step": 257
+    },
+    {
+      "epoch": 0.08256,
+      "grad_norm": 0.394576659186477,
+      "learning_rate": 0.00019855874909364022,
+      "loss": 0.7093,
+      "step": 258
+    },
+    {
+      "epoch": 0.08288,
+      "grad_norm": 0.4306181660524957,
+      "learning_rate": 0.00019854116226690564,
+      "loss": 0.8011,
+      "step": 259
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.43917347187513456,
+      "learning_rate": 0.00019852346957685004,
+      "loss": 0.7001,
+      "step": 260
+    },
+    {
+      "epoch": 0.08352,
+      "grad_norm": 0.38619493331680604,
+      "learning_rate": 0.00019850567104248078,
+      "loss": 0.6714,
+      "step": 261
+    },
+    {
+      "epoch": 0.08384,
+      "grad_norm": 0.39185682881131534,
+      "learning_rate": 0.00019848776668291885,
+      "loss": 0.7346,
+      "step": 262
+    },
+    {
+      "epoch": 0.08416,
+      "grad_norm": 0.3970161198449849,
+      "learning_rate": 0.0001984697565173991,
+      "loss": 0.6804,
+      "step": 263
+    },
+    {
+      "epoch": 0.08448,
+      "grad_norm": 0.40749158498554805,
+      "learning_rate": 0.00019845164056526987,
+      "loss": 0.6896,
+      "step": 264
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.387205310428941,
+      "learning_rate": 0.00019843341884599326,
+      "loss": 0.6973,
+      "step": 265
+    },
+    {
+      "epoch": 0.08512,
+      "grad_norm": 0.3987324599604142,
+      "learning_rate": 0.000198415091379145,
+      "loss": 0.6619,
+      "step": 266
+    },
+    {
+      "epoch": 0.08544,
+      "grad_norm": 0.4285974439889262,
+      "learning_rate": 0.00019839665818441432,
+      "loss": 0.7207,
+      "step": 267
+    },
+    {
+      "epoch": 0.08576,
+      "grad_norm": 0.38351120241378284,
+      "learning_rate": 0.00019837811928160418,
+      "loss": 0.7648,
+      "step": 268
+    },
+    {
+      "epoch": 0.08608,
+      "grad_norm": 0.4365965888950885,
+      "learning_rate": 0.000198359474690631,
+      "loss": 0.7123,
+      "step": 269
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.38602629914454967,
+      "learning_rate": 0.0001983407244315247,
+      "loss": 0.7121,
+      "step": 270
+    },
+    {
+      "epoch": 0.08672,
+      "grad_norm": 0.3990678215990375,
+      "learning_rate": 0.0001983218685244289,
+      "loss": 0.7182,
+      "step": 271
+    },
+    {
+      "epoch": 0.08704,
+      "grad_norm": 0.4017389951871944,
+      "learning_rate": 0.00019830290698960053,
+      "loss": 0.7595,
+      "step": 272
+    },
+    {
+      "epoch": 0.08736,
+      "grad_norm": 0.4142872969203646,
+      "learning_rate": 0.00019828383984741007,
+      "loss": 0.7476,
+      "step": 273
+    },
+    {
+      "epoch": 0.08768,
+      "grad_norm": 0.4312517387355425,
+      "learning_rate": 0.0001982646671183415,
+      "loss": 0.7136,
+      "step": 274
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.39073818316410924,
+      "learning_rate": 0.0001982453888229922,
+      "loss": 0.71,
+      "step": 275
+    },
+    {
+      "epoch": 0.08832,
+      "grad_norm": 0.40378235037568677,
+      "learning_rate": 0.0001982260049820729,
+      "loss": 0.7153,
+      "step": 276
+    },
+    {
+      "epoch": 0.08864,
+      "grad_norm": 0.41744686856989976,
+      "learning_rate": 0.00019820651561640778,
+      "loss": 0.7061,
+      "step": 277
+    },
+    {
+      "epoch": 0.08896,
+      "grad_norm": 0.44073694938968294,
+      "learning_rate": 0.00019818692074693441,
+      "loss": 0.7551,
+      "step": 278
+    },
+    {
+      "epoch": 0.08928,
+      "grad_norm": 0.41495195011598074,
+      "learning_rate": 0.00019816722039470364,
+      "loss": 0.7389,
+      "step": 279
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.3893739240546949,
+      "learning_rate": 0.00019814741458087966,
+      "loss": 0.7219,
+      "step": 280
+    },
+    {
+      "epoch": 0.08992,
+      "grad_norm": 0.39734050761821677,
+      "learning_rate": 0.00019812750332673997,
+      "loss": 0.7144,
+      "step": 281
+    },
+    {
+      "epoch": 0.09024,
+      "grad_norm": 0.3995677661543884,
+      "learning_rate": 0.00019810748665367536,
+      "loss": 0.7737,
+      "step": 282
+    },
+    {
+      "epoch": 0.09056,
+      "grad_norm": 0.38807398553849615,
+      "learning_rate": 0.00019808736458318987,
+      "loss": 0.7286,
+      "step": 283
+    },
+    {
+      "epoch": 0.09088,
+      "grad_norm": 0.3896745873389733,
+      "learning_rate": 0.00019806713713690067,
+      "loss": 0.7723,
+      "step": 284
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.39932755474156606,
+      "learning_rate": 0.0001980468043365383,
+      "loss": 0.7098,
+      "step": 285
+    },
+    {
+      "epoch": 0.09152,
+      "grad_norm": 0.4166439305222204,
+      "learning_rate": 0.0001980263662039464,
+      "loss": 0.7309,
+      "step": 286
+    },
+    {
+      "epoch": 0.09184,
+      "grad_norm": 0.43485967761182587,
+      "learning_rate": 0.00019800582276108172,
+      "loss": 0.7879,
+      "step": 287
+    },
+    {
+      "epoch": 0.09216,
+      "grad_norm": 0.4039692399472808,
+      "learning_rate": 0.00019798517403001422,
+      "loss": 0.6795,
+      "step": 288
+    },
+    {
+      "epoch": 0.09248,
+      "grad_norm": 0.4041694349935407,
+      "learning_rate": 0.00019796442003292697,
+      "loss": 0.725,
+      "step": 289
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4313839714011346,
+      "learning_rate": 0.00019794356079211604,
+      "loss": 0.7963,
+      "step": 290
+    },
+    {
+      "epoch": 0.09312,
+      "grad_norm": 0.3858259405265454,
+      "learning_rate": 0.0001979225963299907,
+      "loss": 0.733,
+      "step": 291
+    },
+    {
+      "epoch": 0.09344,
+      "grad_norm": 0.43041826493714264,
+      "learning_rate": 0.00019790152666907318,
+      "loss": 0.7296,
+      "step": 292
+    },
+    {
+      "epoch": 0.09376,
+      "grad_norm": 0.3803588873535391,
+      "learning_rate": 0.00019788035183199867,
+      "loss": 0.7085,
+      "step": 293
+    },
+    {
+      "epoch": 0.09408,
+      "grad_norm": 0.3988119900129413,
+      "learning_rate": 0.0001978590718415155,
+      "loss": 0.6845,
+      "step": 294
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.38587862906097614,
+      "learning_rate": 0.00019783768672048484,
+      "loss": 0.6867,
+      "step": 295
+    },
+    {
+      "epoch": 0.09472,
+      "grad_norm": 0.37636745195633065,
+      "learning_rate": 0.0001978161964918808,
+      "loss": 0.7104,
+      "step": 296
+    },
+    {
+      "epoch": 0.09504,
+      "grad_norm": 0.4233309449118061,
+      "learning_rate": 0.00019779460117879056,
+      "loss": 0.7115,
+      "step": 297
+    },
+    {
+      "epoch": 0.09536,
+      "grad_norm": 0.38227248134881303,
+      "learning_rate": 0.00019777290080441403,
+      "loss": 0.6601,
+      "step": 298
+    },
+    {
+      "epoch": 0.09568,
+      "grad_norm": 0.38112738163132837,
+      "learning_rate": 0.000197751095392064,
+      "loss": 0.7168,
+      "step": 299
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.40719871074341357,
+      "learning_rate": 0.00019772918496516618,
+      "loss": 0.7509,
+      "step": 300
+    },
+    {
+      "epoch": 0.09632,
+      "grad_norm": 0.3902519370404223,
+      "learning_rate": 0.0001977071695472591,
+      "loss": 0.736,
+      "step": 301
+    },
+    {
+      "epoch": 0.09664,
+      "grad_norm": 0.39785614818438686,
+      "learning_rate": 0.00019768504916199402,
+      "loss": 0.6749,
+      "step": 302
+    },
+    {
+      "epoch": 0.09696,
+      "grad_norm": 0.4337021476051533,
+      "learning_rate": 0.00019766282383313496,
+      "loss": 0.7215,
+      "step": 303
+    },
+    {
+      "epoch": 0.09728,
+      "grad_norm": 0.3994556451153538,
+      "learning_rate": 0.0001976404935845588,
+      "loss": 0.6886,
+      "step": 304
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4423135171685121,
+      "learning_rate": 0.00019761805844025493,
+      "loss": 0.7452,
+      "step": 305
+    },
+    {
+      "epoch": 0.09792,
+      "grad_norm": 0.4002539317488952,
+      "learning_rate": 0.00019759551842432567,
+      "loss": 0.6484,
+      "step": 306
+    },
+    {
+      "epoch": 0.09824,
+      "grad_norm": 0.37680569680480824,
+      "learning_rate": 0.00019757287356098578,
+      "loss": 0.7572,
+      "step": 307
+    },
+    {
+      "epoch": 0.09856,
+      "grad_norm": 0.4098670439538843,
+      "learning_rate": 0.00019755012387456287,
+      "loss": 0.7531,
+      "step": 308
+    },
+    {
+      "epoch": 0.09888,
+      "grad_norm": 0.39289540752990076,
+      "learning_rate": 0.00019752726938949695,
+      "loss": 0.7133,
+      "step": 309
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.38922217354120847,
+      "learning_rate": 0.0001975043101303408,
+      "loss": 0.732,
+      "step": 310
+    },
+    {
+      "epoch": 0.09952,
+      "grad_norm": 0.3880880034800749,
+      "learning_rate": 0.00019748124612175964,
+      "loss": 0.6843,
+      "step": 311
+    },
+    {
+      "epoch": 0.09984,
+      "grad_norm": 0.3926781777740438,
+      "learning_rate": 0.00019745807738853129,
+      "loss": 0.7181,
+      "step": 312
+    },
+    {
+      "epoch": 0.10016,
+      "grad_norm": 0.3989927994943441,
+      "learning_rate": 0.000197434803955546,
+      "loss": 0.6273,
+      "step": 313
+    },
+    {
+      "epoch": 0.10048,
+      "grad_norm": 0.41350674532685083,
+      "learning_rate": 0.00019741142584780663,
+      "loss": 0.7536,
+      "step": 314
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.39078931167545056,
+      "learning_rate": 0.00019738794309042833,
+      "loss": 0.7497,
+      "step": 315
+    },
+    {
+      "epoch": 0.10112,
+      "grad_norm": 0.40601442946106014,
+      "learning_rate": 0.00019736435570863882,
+      "loss": 0.6929,
+      "step": 316
+    },
+    {
+      "epoch": 0.10144,
+      "grad_norm": 0.3675734639441937,
+      "learning_rate": 0.00019734066372777812,
+      "loss": 0.7149,
+      "step": 317
+    },
+    {
+      "epoch": 0.10176,
+      "grad_norm": 0.40099515701906197,
+      "learning_rate": 0.00019731686717329864,
+      "loss": 0.7306,
+      "step": 318
+    },
+    {
+      "epoch": 0.10208,
+      "grad_norm": 0.36265216681507756,
+      "learning_rate": 0.0001972929660707652,
+      "loss": 0.6564,
+      "step": 319
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.392931660116471,
+      "learning_rate": 0.00019726896044585486,
+      "loss": 0.7989,
+      "step": 320
+    },
+    {
+      "epoch": 0.10272,
+      "grad_norm": 0.41333241036388846,
+      "learning_rate": 0.000197244850324357,
+      "loss": 0.7486,
+      "step": 321
+    },
+    {
+      "epoch": 0.10304,
+      "grad_norm": 0.384478374445219,
+      "learning_rate": 0.00019722063573217327,
+      "loss": 0.7425,
+      "step": 322
+    },
+    {
+      "epoch": 0.10336,
+      "grad_norm": 0.38101090140120525,
+      "learning_rate": 0.0001971963166953175,
+      "loss": 0.6949,
+      "step": 323
+    },
+    {
+      "epoch": 0.10368,
+      "grad_norm": 0.37823225285318696,
+      "learning_rate": 0.00019717189323991584,
+      "loss": 0.6881,
+      "step": 324
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.3655537604689273,
+      "learning_rate": 0.00019714736539220648,
+      "loss": 0.6682,
+      "step": 325
+    },
+    {
+      "epoch": 0.10432,
+      "grad_norm": 0.39197705762369495,
+      "learning_rate": 0.00019712273317853987,
+      "loss": 0.6997,
+      "step": 326
+    },
+    {
+      "epoch": 0.10464,
+      "grad_norm": 0.3922854893583484,
+      "learning_rate": 0.0001970979966253785,
+      "loss": 0.6911,
+      "step": 327
+    },
+    {
+      "epoch": 0.10496,
+      "grad_norm": 0.39687178433365566,
+      "learning_rate": 0.00019707315575929698,
+      "loss": 0.7163,
+      "step": 328
+    },
+    {
+      "epoch": 0.10528,
+      "grad_norm": 0.42409085617806075,
+      "learning_rate": 0.000197048210606982,
+      "loss": 0.7043,
+      "step": 329
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.43758902659978566,
+      "learning_rate": 0.00019702316119523235,
+      "loss": 0.7661,
+      "step": 330
+    },
+    {
+      "epoch": 0.10592,
+      "grad_norm": 0.38965988942002916,
+      "learning_rate": 0.00019699800755095865,
+      "loss": 0.6692,
+      "step": 331
+    },
+    {
+      "epoch": 0.10624,
+      "grad_norm": 0.41109767298007194,
+      "learning_rate": 0.00019697274970118366,
+      "loss": 0.705,
+      "step": 332
+    },
+    {
+      "epoch": 0.10656,
+      "grad_norm": 0.37753455435085426,
+      "learning_rate": 0.00019694738767304197,
+      "loss": 0.6923,
+      "step": 333
+    },
+    {
+      "epoch": 0.10688,
+      "grad_norm": 0.3648177509009567,
+      "learning_rate": 0.00019692192149378023,
+      "loss": 0.7262,
+      "step": 334
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.3916748545574145,
+      "learning_rate": 0.00019689635119075682,
+      "loss": 0.7081,
+      "step": 335
+    },
+    {
+      "epoch": 0.10752,
+      "grad_norm": 0.383792734246004,
+      "learning_rate": 0.00019687067679144212,
+      "loss": 0.6848,
+      "step": 336
+    },
+    {
+      "epoch": 0.10784,
+      "grad_norm": 0.3984452938273053,
+      "learning_rate": 0.00019684489832341826,
+      "loss": 0.713,
+      "step": 337
+    },
+    {
+      "epoch": 0.10816,
+      "grad_norm": 0.39431641919942684,
+      "learning_rate": 0.00019681901581437917,
+      "loss": 0.7119,
+      "step": 338
+    },
+    {
+      "epoch": 0.10848,
+      "grad_norm": 0.40844450591602066,
+      "learning_rate": 0.00019679302929213058,
+      "loss": 0.7191,
+      "step": 339
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.41398697679787083,
+      "learning_rate": 0.00019676693878459002,
+      "loss": 0.7771,
+      "step": 340
+    },
+    {
+      "epoch": 0.10912,
+      "grad_norm": 0.5955820097137865,
+      "learning_rate": 0.00019674074431978657,
+      "loss": 0.7569,
+      "step": 341
+    },
+    {
+      "epoch": 0.10944,
+      "grad_norm": 0.3919756720617864,
+      "learning_rate": 0.00019671444592586117,
+      "loss": 0.6646,
+      "step": 342
+    },
+    {
+      "epoch": 0.10976,
+      "grad_norm": 0.3703194327080098,
+      "learning_rate": 0.00019668804363106627,
+      "loss": 0.6757,
+      "step": 343
+    },
+    {
+      "epoch": 0.11008,
+      "grad_norm": 0.3765740441482955,
+      "learning_rate": 0.00019666153746376606,
+      "loss": 0.7035,
+      "step": 344
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.38879270566355456,
+      "learning_rate": 0.00019663492745243622,
+      "loss": 0.698,
+      "step": 345
+    },
+    {
+      "epoch": 0.11072,
+      "grad_norm": 0.40171601066504403,
+      "learning_rate": 0.00019660821362566403,
+      "loss": 0.7354,
+      "step": 346
+    },
+    {
+      "epoch": 0.11104,
+      "grad_norm": 0.37111334014738273,
+      "learning_rate": 0.00019658139601214835,
+      "loss": 0.7475,
+      "step": 347
+    },
+    {
+      "epoch": 0.11136,
+      "grad_norm": 0.3978166612221857,
+      "learning_rate": 0.00019655447464069945,
+      "loss": 0.7028,
+      "step": 348
+    },
+    {
+      "epoch": 0.11168,
+      "grad_norm": 0.3887547492967027,
+      "learning_rate": 0.00019652744954023912,
+      "loss": 0.6875,
+      "step": 349
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4111348797509523,
+      "learning_rate": 0.00019650032073980058,
+      "loss": 0.7415,
+      "step": 350
+    },
+    {
+      "epoch": 0.11232,
+      "grad_norm": 0.43309875952900573,
+      "learning_rate": 0.0001964730882685285,
+      "loss": 0.7014,
+      "step": 351
+    },
+    {
+      "epoch": 0.11264,
+      "grad_norm": 0.40540616161922716,
+      "learning_rate": 0.00019644575215567876,
+      "loss": 0.6965,
+      "step": 352
+    },
+    {
+      "epoch": 0.11296,
+      "grad_norm": 0.4096166178936405,
+      "learning_rate": 0.0001964183124306188,
+      "loss": 0.7619,
+      "step": 353
+    },
+    {
+      "epoch": 0.11328,
+      "grad_norm": 0.379456114577488,
+      "learning_rate": 0.0001963907691228272,
+      "loss": 0.7117,
+      "step": 354
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4069196300965038,
+      "learning_rate": 0.00019636312226189399,
+      "loss": 0.7495,
+      "step": 355
+    },
+    {
+      "epoch": 0.11392,
+      "grad_norm": 0.37669048703806457,
+      "learning_rate": 0.00019633537187752022,
+      "loss": 0.681,
+      "step": 356
+    },
+    {
+      "epoch": 0.11424,
+      "grad_norm": 0.39215560481927914,
+      "learning_rate": 0.00019630751799951836,
+      "loss": 0.7227,
+      "step": 357
+    },
+    {
+      "epoch": 0.11456,
+      "grad_norm": 0.3712988338345372,
+      "learning_rate": 0.000196279560657812,
+      "loss": 0.6932,
+      "step": 358
+    },
+    {
+      "epoch": 0.11488,
+      "grad_norm": 0.374590012989257,
+      "learning_rate": 0.0001962514998824358,
+      "loss": 0.679,
+      "step": 359
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.367036087727824,
+      "learning_rate": 0.00019622333570353567,
+      "loss": 0.6418,
+      "step": 360
+    },
+    {
+      "epoch": 0.11552,
+      "grad_norm": 0.38005827094282785,
+      "learning_rate": 0.00019619506815136856,
+      "loss": 0.7146,
+      "step": 361
+    },
+    {
+      "epoch": 0.11584,
+      "grad_norm": 0.3934441512232598,
+      "learning_rate": 0.00019616669725630237,
+      "loss": 0.6961,
+      "step": 362
+    },
+    {
+      "epoch": 0.11616,
+      "grad_norm": 0.38764081166938164,
+      "learning_rate": 0.0001961382230488162,
+      "loss": 0.7108,
+      "step": 363
+    },
+    {
+      "epoch": 0.11648,
+      "grad_norm": 0.3843976388365349,
+      "learning_rate": 0.00019610964555949998,
+      "loss": 0.7215,
+      "step": 364
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.37067109235427065,
+      "learning_rate": 0.0001960809648190547,
+      "loss": 0.7085,
+      "step": 365
+    },
+    {
+      "epoch": 0.11712,
+      "grad_norm": 0.3770878409066295,
+      "learning_rate": 0.00019605218085829226,
+      "loss": 0.7356,
+      "step": 366
+    },
+    {
+      "epoch": 0.11744,
+      "grad_norm": 0.3890169918260607,
+      "learning_rate": 0.00019602329370813543,
+      "loss": 0.721,
+      "step": 367
+    },
+    {
+      "epoch": 0.11776,
+      "grad_norm": 0.40592105605295065,
+      "learning_rate": 0.00019599430339961777,
+      "loss": 0.7165,
+      "step": 368
+    },
+    {
+      "epoch": 0.11808,
+      "grad_norm": 0.3965771578828951,
+      "learning_rate": 0.0001959652099638838,
+      "loss": 0.7477,
+      "step": 369
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3981457787965504,
+      "learning_rate": 0.00019593601343218873,
+      "loss": 0.6976,
+      "step": 370
+    },
+    {
+      "epoch": 0.11872,
+      "grad_norm": 0.8638849378376803,
+      "learning_rate": 0.00019590671383589857,
+      "loss": 0.6371,
+      "step": 371
+    },
+    {
+      "epoch": 0.11904,
+      "grad_norm": 0.42371243699872835,
+      "learning_rate": 0.00019587731120649006,
+      "loss": 0.732,
+      "step": 372
+    },
+    {
+      "epoch": 0.11936,
+      "grad_norm": 0.37303697851029777,
+      "learning_rate": 0.00019584780557555055,
+      "loss": 0.7099,
+      "step": 373
+    },
+    {
+      "epoch": 0.11968,
+      "grad_norm": 0.37309287181346984,
+      "learning_rate": 0.00019581819697477812,
+      "loss": 0.7065,
+      "step": 374
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.37341602065644214,
+      "learning_rate": 0.0001957884854359815,
+      "loss": 0.712,
+      "step": 375
+    },
+    {
+      "epoch": 0.12032,
+      "grad_norm": 0.3938336337340652,
+      "learning_rate": 0.00019575867099107992,
+      "loss": 0.7176,
+      "step": 376
+    },
+    {
+      "epoch": 0.12064,
+      "grad_norm": 0.37294778150394053,
+      "learning_rate": 0.00019572875367210324,
+      "loss": 0.6682,
+      "step": 377
+    },
+    {
+      "epoch": 0.12096,
+      "grad_norm": 0.42590614261212495,
+      "learning_rate": 0.00019569873351119176,
+      "loss": 0.7735,
+      "step": 378
+    },
+    {
+      "epoch": 0.12128,
+      "grad_norm": 0.39030842770338153,
+      "learning_rate": 0.00019566861054059635,
+      "loss": 0.6681,
+      "step": 379
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.38551260912476676,
+      "learning_rate": 0.00019563838479267823,
+      "loss": 0.6603,
+      "step": 380
+    },
+    {
+      "epoch": 0.12192,
+      "grad_norm": 0.3724970175408012,
+      "learning_rate": 0.00019560805629990918,
+      "loss": 0.6926,
+      "step": 381
+    },
+    {
+      "epoch": 0.12224,
+      "grad_norm": 0.3642718535051343,
+      "learning_rate": 0.00019557762509487118,
+      "loss": 0.7204,
+      "step": 382
+    },
+    {
+      "epoch": 0.12256,
+      "grad_norm": 0.42099778531197657,
+      "learning_rate": 0.00019554709121025668,
+      "loss": 0.6775,
+      "step": 383
+    },
+    {
+      "epoch": 0.12288,
+      "grad_norm": 0.3748142124825435,
+      "learning_rate": 0.00019551645467886838,
+      "loss": 0.696,
+      "step": 384
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.397445605346402,
+      "learning_rate": 0.00019548571553361935,
+      "loss": 0.6832,
+      "step": 385
+    },
+    {
+      "epoch": 0.12352,
+      "grad_norm": 0.40512924448828924,
+      "learning_rate": 0.00019545487380753272,
+      "loss": 0.7361,
+      "step": 386
+    },
+    {
+      "epoch": 0.12384,
+      "grad_norm": 0.37382306080883454,
+      "learning_rate": 0.00019542392953374199,
+      "loss": 0.6811,
+      "step": 387
+    },
+    {
+      "epoch": 0.12416,
+      "grad_norm": 0.3767244954827213,
+      "learning_rate": 0.00019539288274549076,
+      "loss": 0.6763,
+      "step": 388
+    },
+    {
+      "epoch": 0.12448,
+      "grad_norm": 0.39692856518474445,
+      "learning_rate": 0.00019536173347613276,
+      "loss": 0.7161,
+      "step": 389
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3714663251464577,
+      "learning_rate": 0.00019533048175913184,
+      "loss": 0.6755,
+      "step": 390
+    },
+    {
+      "epoch": 0.12512,
+      "grad_norm": 0.35768354200580715,
+      "learning_rate": 0.0001952991276280619,
+      "loss": 0.6706,
+      "step": 391
+    },
+    {
+      "epoch": 0.12544,
+      "grad_norm": 0.37211542233846645,
+      "learning_rate": 0.0001952676711166068,
+      "loss": 0.6699,
+      "step": 392
+    },
+    {
+      "epoch": 0.12576,
+      "grad_norm": 0.4191728240750743,
+      "learning_rate": 0.00019523611225856052,
+      "loss": 0.7522,
+      "step": 393
+    },
+    {
+      "epoch": 0.12608,
+      "grad_norm": 0.45781307337599053,
+      "learning_rate": 0.00019520445108782685,
+      "loss": 0.6705,
+      "step": 394
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.43145682281704706,
+      "learning_rate": 0.00019517268763841962,
+      "loss": 0.7601,
+      "step": 395
+    },
+    {
+      "epoch": 0.12672,
+      "grad_norm": 0.39568012091223576,
+      "learning_rate": 0.00019514082194446245,
+      "loss": 0.7347,
+      "step": 396
+    },
+    {
+      "epoch": 0.12704,
+      "grad_norm": 0.37485006940824006,
+      "learning_rate": 0.00019510885404018887,
+      "loss": 0.7333,
+      "step": 397
+    },
+    {
+      "epoch": 0.12736,
+      "grad_norm": 0.3839258549590152,
+      "learning_rate": 0.0001950767839599421,
+      "loss": 0.6951,
+      "step": 398
+    },
+    {
+      "epoch": 0.12768,
+      "grad_norm": 0.36938585152920694,
+      "learning_rate": 0.00019504461173817532,
+      "loss": 0.7345,
+      "step": 399
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3705899678112069,
+      "learning_rate": 0.0001950123374094512,
+      "loss": 0.697,
+      "step": 400
+    },
+    {
+      "epoch": 0.12832,
+      "grad_norm": 0.36234883123715833,
+      "learning_rate": 0.00019497996100844233,
+      "loss": 0.7085,
+      "step": 401
+    },
+    {
+      "epoch": 0.12864,
+      "grad_norm": 0.34592574496296163,
+      "learning_rate": 0.0001949474825699308,
+      "loss": 0.589,
+      "step": 402
+    },
+    {
+      "epoch": 0.12896,
+      "grad_norm": 0.37931984998800367,
+      "learning_rate": 0.00019491490212880842,
+      "loss": 0.7359,
+      "step": 403
+    },
+    {
+      "epoch": 0.12928,
+      "grad_norm": 0.40686094077555585,
+      "learning_rate": 0.00019488221972007653,
+      "loss": 0.7494,
+      "step": 404
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.3663956264428909,
+      "learning_rate": 0.000194849435378846,
+      "loss": 0.711,
+      "step": 405
+    },
+    {
+      "epoch": 0.12992,
+      "grad_norm": 0.3782423164452965,
+      "learning_rate": 0.00019481654914033723,
+      "loss": 0.6695,
+      "step": 406
+    },
+    {
+      "epoch": 0.13024,
+      "grad_norm": 0.38836255872524833,
+      "learning_rate": 0.00019478356103988013,
+      "loss": 0.77,
+      "step": 407
+    },
+    {
+      "epoch": 0.13056,
+      "grad_norm": 0.38528257338844074,
+      "learning_rate": 0.00019475047111291397,
+      "loss": 0.7114,
+      "step": 408
+    },
+    {
+      "epoch": 0.13088,
+      "grad_norm": 0.39360001304381337,
+      "learning_rate": 0.00019471727939498744,
+      "loss": 0.6888,
+      "step": 409
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.36464041211415976,
+      "learning_rate": 0.00019468398592175861,
+      "loss": 0.7163,
+      "step": 410
+    },
+    {
+      "epoch": 0.13152,
+      "grad_norm": 0.40454548738611545,
+      "learning_rate": 0.00019465059072899484,
+      "loss": 0.6832,
+      "step": 411
+    },
+    {
+      "epoch": 0.13184,
+      "grad_norm": 0.38499407028274757,
+      "learning_rate": 0.00019461709385257275,
+      "loss": 0.7236,
+      "step": 412
+    },
+    {
+      "epoch": 0.13216,
+      "grad_norm": 0.3816000803054317,
+      "learning_rate": 0.00019458349532847823,
+      "loss": 0.6653,
+      "step": 413
+    },
+    {
+      "epoch": 0.13248,
+      "grad_norm": 0.3733530215542171,
+      "learning_rate": 0.0001945497951928064,
+      "loss": 0.685,
+      "step": 414
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.36736445370040927,
+      "learning_rate": 0.00019451599348176143,
+      "loss": 0.6857,
+      "step": 415
+    },
+    {
+      "epoch": 0.13312,
+      "grad_norm": 0.3833224303268615,
+      "learning_rate": 0.00019448209023165675,
+      "loss": 0.6891,
+      "step": 416
+    },
+    {
+      "epoch": 0.13344,
+      "grad_norm": 0.37795485485322516,
+      "learning_rate": 0.0001944480854789148,
+      "loss": 0.6779,
+      "step": 417
+    },
+    {
+      "epoch": 0.13376,
+      "grad_norm": 0.4025319869991608,
+      "learning_rate": 0.00019441397926006705,
+      "loss": 0.7184,
+      "step": 418
+    },
+    {
+      "epoch": 0.13408,
+      "grad_norm": 0.3768627111149694,
+      "learning_rate": 0.00019437977161175401,
+      "loss": 0.6517,
+      "step": 419
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.39395613081756176,
+      "learning_rate": 0.00019434546257072517,
+      "loss": 0.7245,
+      "step": 420
+    },
+    {
+      "epoch": 0.13472,
+      "grad_norm": 0.37531513939740224,
+      "learning_rate": 0.0001943110521738389,
+      "loss": 0.6626,
+      "step": 421
+    },
+    {
+      "epoch": 0.13504,
+      "grad_norm": 0.3921708037929644,
+      "learning_rate": 0.0001942765404580625,
+      "loss": 0.7297,
+      "step": 422
+    },
+    {
+      "epoch": 0.13536,
+      "grad_norm": 0.37327316009240097,
+      "learning_rate": 0.00019424192746047208,
+      "loss": 0.7166,
+      "step": 423
+    },
+    {
+      "epoch": 0.13568,
+      "grad_norm": 0.37761914448685563,
+      "learning_rate": 0.0001942072132182526,
+      "loss": 0.7518,
+      "step": 424
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.36188829160240127,
+      "learning_rate": 0.00019417239776869772,
+      "loss": 0.6891,
+      "step": 425
+    },
+    {
+      "epoch": 0.13632,
+      "grad_norm": 0.39134382392947276,
+      "learning_rate": 0.0001941374811492099,
+      "loss": 0.6923,
+      "step": 426
+    },
+    {
+      "epoch": 0.13664,
+      "grad_norm": 0.37912454753106717,
+      "learning_rate": 0.00019410246339730033,
+      "loss": 0.7185,
+      "step": 427
+    },
+    {
+      "epoch": 0.13696,
+      "grad_norm": 0.36769610925856816,
+      "learning_rate": 0.00019406734455058863,
+      "loss": 0.6443,
+      "step": 428
+    },
+    {
+      "epoch": 0.13728,
+      "grad_norm": 0.3634857691225331,
+      "learning_rate": 0.00019403212464680328,
+      "loss": 0.7168,
+      "step": 429
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.41051051790070753,
+      "learning_rate": 0.0001939968037237812,
+      "loss": 0.7195,
+      "step": 430
+    },
+    {
+      "epoch": 0.13792,
+      "grad_norm": 0.3919784059436654,
+      "learning_rate": 0.00019396138181946784,
+      "loss": 0.7431,
+      "step": 431
+    },
+    {
+      "epoch": 0.13824,
+      "grad_norm": 0.38074687066455964,
+      "learning_rate": 0.00019392585897191715,
+      "loss": 0.7228,
+      "step": 432
+    },
+    {
+      "epoch": 0.13856,
+      "grad_norm": 0.3910341618373511,
+      "learning_rate": 0.00019389023521929156,
+      "loss": 0.6545,
+      "step": 433
+    },
+    {
+      "epoch": 0.13888,
+      "grad_norm": 0.36586330213672674,
+      "learning_rate": 0.0001938545105998618,
+      "loss": 0.6702,
+      "step": 434
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.3961879535607793,
+      "learning_rate": 0.00019381868515200705,
+      "loss": 0.6999,
+      "step": 435
+    },
+    {
+      "epoch": 0.13952,
+      "grad_norm": 0.3815459570578981,
+      "learning_rate": 0.00019378275891421485,
+      "loss": 0.7007,
+      "step": 436
+    },
+    {
+      "epoch": 0.13984,
+      "grad_norm": 0.38614703987913424,
+      "learning_rate": 0.00019374673192508088,
+      "loss": 0.6883,
+      "step": 437
+    },
+    {
+      "epoch": 0.14016,
+      "grad_norm": 0.3768851582559416,
+      "learning_rate": 0.00019371060422330918,
+      "loss": 0.7067,
+      "step": 438
+    },
+    {
+      "epoch": 0.14048,
+      "grad_norm": 0.4007619168472735,
+      "learning_rate": 0.00019367437584771188,
+      "loss": 0.7185,
+      "step": 439
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3759299339196291,
+      "learning_rate": 0.00019363804683720942,
+      "loss": 0.6766,
+      "step": 440
+    },
+    {
+      "epoch": 0.14112,
+      "grad_norm": 0.3790574386908885,
+      "learning_rate": 0.0001936016172308302,
+      "loss": 0.6759,
+      "step": 441
+    },
+    {
+      "epoch": 0.14144,
+      "grad_norm": 0.4018142631283464,
+      "learning_rate": 0.00019356508706771077,
+      "loss": 0.7008,
+      "step": 442
+    },
+    {
+      "epoch": 0.14176,
+      "grad_norm": 0.3715104629374323,
+      "learning_rate": 0.0001935284563870957,
+      "loss": 0.6847,
+      "step": 443
+    },
+    {
+      "epoch": 0.14208,
+      "grad_norm": 0.3915693736583732,
+      "learning_rate": 0.00019349172522833746,
+      "loss": 0.6917,
+      "step": 444
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.3922212702326427,
+      "learning_rate": 0.00019345489363089665,
+      "loss": 0.7113,
+      "step": 445
+    },
+    {
+      "epoch": 0.14272,
+      "grad_norm": 0.3511858399340752,
+      "learning_rate": 0.00019341796163434158,
+      "loss": 0.645,
+      "step": 446
+    },
+    {
+      "epoch": 0.14304,
+      "grad_norm": 0.3772806512240585,
+      "learning_rate": 0.00019338092927834855,
+      "loss": 0.652,
+      "step": 447
+    },
+    {
+      "epoch": 0.14336,
+      "grad_norm": 0.37316128053122677,
+      "learning_rate": 0.00019334379660270156,
+      "loss": 0.7183,
+      "step": 448
+    },
+    {
+      "epoch": 0.14368,
+      "grad_norm": 0.3706915047949919,
+      "learning_rate": 0.00019330656364729252,
+      "loss": 0.7094,
+      "step": 449
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.36577702005688933,
+      "learning_rate": 0.00019326923045212096,
+      "loss": 0.6746,
+      "step": 450
+    },
+    {
+      "epoch": 0.14432,
+      "grad_norm": 0.4258226313588293,
+      "learning_rate": 0.0001932317970572942,
+      "loss": 0.7136,
+      "step": 451
+    },
+    {
+      "epoch": 0.14464,
+      "grad_norm": 0.339175107622932,
+      "learning_rate": 0.00019319426350302706,
+      "loss": 0.6854,
+      "step": 452
+    },
+    {
+      "epoch": 0.14496,
+      "grad_norm": 0.3643605592427322,
+      "learning_rate": 0.00019315662982964207,
+      "loss": 0.7035,
+      "step": 453
+    },
+    {
+      "epoch": 0.14528,
+      "grad_norm": 0.37048444487208876,
+      "learning_rate": 0.00019311889607756934,
+      "loss": 0.7052,
+      "step": 454
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.3878958949320137,
+      "learning_rate": 0.00019308106228734643,
+      "loss": 0.7187,
+      "step": 455
+    },
+    {
+      "epoch": 0.14592,
+      "grad_norm": 0.3757245798801654,
+      "learning_rate": 0.00019304312849961836,
+      "loss": 0.6823,
+      "step": 456
+    },
+    {
+      "epoch": 0.14624,
+      "grad_norm": 0.35536761510980036,
+      "learning_rate": 0.00019300509475513765,
+      "loss": 0.6889,
+      "step": 457
+    },
+    {
+      "epoch": 0.14656,
+      "grad_norm": 0.3949603453634783,
+      "learning_rate": 0.00019296696109476417,
+      "loss": 0.7278,
+      "step": 458
+    },
+    {
+      "epoch": 0.14688,
+      "grad_norm": 0.3964081864025015,
+      "learning_rate": 0.00019292872755946507,
+      "loss": 0.6649,
+      "step": 459
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.36046615405584714,
+      "learning_rate": 0.00019289039419031492,
+      "loss": 0.6674,
+      "step": 460
+    },
+    {
+      "epoch": 0.14752,
+      "grad_norm": 0.3658347620257223,
+      "learning_rate": 0.00019285196102849543,
+      "loss": 0.646,
+      "step": 461
+    },
+    {
+      "epoch": 0.14784,
+      "grad_norm": 0.37838847219314875,
+      "learning_rate": 0.00019281342811529556,
+      "loss": 0.6343,
+      "step": 462
+    },
+    {
+      "epoch": 0.14816,
+      "grad_norm": 0.39487930517644987,
+      "learning_rate": 0.00019277479549211144,
+      "loss": 0.6999,
+      "step": 463
+    },
+    {
+      "epoch": 0.14848,
+      "grad_norm": 0.3834375094430825,
+      "learning_rate": 0.00019273606320044628,
+      "loss": 0.6894,
+      "step": 464
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.3884922732476337,
+      "learning_rate": 0.00019269723128191048,
+      "loss": 0.7034,
+      "step": 465
+    },
+    {
+      "epoch": 0.14912,
+      "grad_norm": 0.47656592826429717,
+      "learning_rate": 0.00019265829977822133,
+      "loss": 0.7038,
+      "step": 466
+    },
+    {
+      "epoch": 0.14944,
+      "grad_norm": 0.368130993256656,
+      "learning_rate": 0.00019261926873120316,
+      "loss": 0.7235,
+      "step": 467
+    },
+    {
+      "epoch": 0.14976,
+      "grad_norm": 0.377347329743766,
+      "learning_rate": 0.00019258013818278726,
+      "loss": 0.7577,
+      "step": 468
+    },
+    {
+      "epoch": 0.15008,
+      "grad_norm": 0.3567250409471538,
+      "learning_rate": 0.0001925409081750118,
+      "loss": 0.6744,
+      "step": 469
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3689876186039553,
+      "learning_rate": 0.00019250157875002176,
+      "loss": 0.6927,
+      "step": 470
+    },
+    {
+      "epoch": 0.15072,
+      "grad_norm": 0.38071445186778236,
+      "learning_rate": 0.000192462149950069,
+      "loss": 0.6704,
+      "step": 471
+    },
+    {
+      "epoch": 0.15104,
+      "grad_norm": 0.3582514404328002,
+      "learning_rate": 0.00019242262181751207,
+      "loss": 0.6678,
+      "step": 472
+    },
+    {
+      "epoch": 0.15136,
+      "grad_norm": 0.379749312867091,
+      "learning_rate": 0.00019238299439481633,
+      "loss": 0.6639,
+      "step": 473
+    },
+    {
+      "epoch": 0.15168,
+      "grad_norm": 0.36718418254493995,
+      "learning_rate": 0.00019234326772455364,
+      "loss": 0.7316,
+      "step": 474
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.37382296540011056,
+      "learning_rate": 0.00019230344184940267,
+      "loss": 0.6704,
+      "step": 475
+    },
+    {
+      "epoch": 0.15232,
+      "grad_norm": 0.374160371229486,
+      "learning_rate": 0.00019226351681214855,
+      "loss": 0.6644,
+      "step": 476
+    },
+    {
+      "epoch": 0.15264,
+      "grad_norm": 0.38323432553888526,
+      "learning_rate": 0.00019222349265568292,
+      "loss": 0.6906,
+      "step": 477
+    },
+    {
+      "epoch": 0.15296,
+      "grad_norm": 0.3703821792666677,
+      "learning_rate": 0.000192183369423004,
+      "loss": 0.6516,
+      "step": 478
+    },
+    {
+      "epoch": 0.15328,
+      "grad_norm": 0.3988791062666491,
+      "learning_rate": 0.00019214314715721646,
+      "loss": 0.6673,
+      "step": 479
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.37737714726947547,
+      "learning_rate": 0.0001921028259015312,
+      "loss": 0.6981,
+      "step": 480
+    },
+    {
+      "epoch": 0.15392,
+      "grad_norm": 0.4074991460109341,
+      "learning_rate": 0.00019206240569926566,
+      "loss": 0.7292,
+      "step": 481
+    },
+    {
+      "epoch": 0.15424,
+      "grad_norm": 0.3766169697072659,
+      "learning_rate": 0.00019202188659384344,
+      "loss": 0.6762,
+      "step": 482
+    },
+    {
+      "epoch": 0.15456,
+      "grad_norm": 0.3868353365562728,
+      "learning_rate": 0.00019198126862879442,
+      "loss": 0.6593,
+      "step": 483
+    },
+    {
+      "epoch": 0.15488,
+      "grad_norm": 0.39493266875324456,
+      "learning_rate": 0.00019194055184775476,
+      "loss": 0.7303,
+      "step": 484
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.375576416682949,
+      "learning_rate": 0.00019189973629446668,
+      "loss": 0.7138,
+      "step": 485
+    },
+    {
+      "epoch": 0.15552,
+      "grad_norm": 0.36822796412355435,
+      "learning_rate": 0.0001918588220127786,
+      "loss": 0.6774,
+      "step": 486
+    },
+    {
+      "epoch": 0.15584,
+      "grad_norm": 0.37519671549304895,
+      "learning_rate": 0.00019181780904664497,
+      "loss": 0.6485,
+      "step": 487
+    },
+    {
+      "epoch": 0.15616,
+      "grad_norm": 0.3903561740445914,
+      "learning_rate": 0.00019177669744012616,
+      "loss": 0.7331,
+      "step": 488
+    },
+    {
+      "epoch": 0.15648,
+      "grad_norm": 0.39679592296013205,
+      "learning_rate": 0.0001917354872373887,
+      "loss": 0.7154,
+      "step": 489
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.3966690850434934,
+      "learning_rate": 0.0001916941784827049,
+      "loss": 0.708,
+      "step": 490
+    },
+    {
+      "epoch": 0.15712,
+      "grad_norm": 0.35572597980615833,
+      "learning_rate": 0.00019165277122045292,
+      "loss": 0.6958,
+      "step": 491
+    },
+    {
+      "epoch": 0.15744,
+      "grad_norm": 0.3684989298750791,
+      "learning_rate": 0.00019161126549511695,
+      "loss": 0.7078,
+      "step": 492
+    },
+    {
+      "epoch": 0.15776,
+      "grad_norm": 0.37494314522046396,
+      "learning_rate": 0.0001915696613512867,
+      "loss": 0.6826,
+      "step": 493
+    },
+    {
+      "epoch": 0.15808,
+      "grad_norm": 0.36800431369769,
+      "learning_rate": 0.00019152795883365783,
+      "loss": 0.7043,
+      "step": 494
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.3645216670638875,
+      "learning_rate": 0.00019148615798703146,
+      "loss": 0.6872,
+      "step": 495
+    },
+    {
+      "epoch": 0.15872,
+      "grad_norm": 0.38843250403631,
+      "learning_rate": 0.00019144425885631464,
+      "loss": 0.6895,
+      "step": 496
+    },
+    {
+      "epoch": 0.15904,
+      "grad_norm": 0.3671826558394155,
+      "learning_rate": 0.00019140226148651971,
+      "loss": 0.675,
+      "step": 497
+    },
+    {
+      "epoch": 0.15936,
+      "grad_norm": 0.38980079974319665,
+      "learning_rate": 0.00019136016592276477,
+      "loss": 0.7428,
+      "step": 498
+    },
+    {
+      "epoch": 0.15968,
+      "grad_norm": 0.3847898120536561,
+      "learning_rate": 0.0001913179722102732,
+      "loss": 0.6488,
+      "step": 499
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3815823017313607,
+      "learning_rate": 0.00019127568039437406,
+      "loss": 0.7112,
+      "step": 500
+    },
+    {
+      "epoch": 0.16032,
+      "grad_norm": 0.36283403121938096,
+      "learning_rate": 0.00019123329052050166,
+      "loss": 0.7008,
+      "step": 501
+    },
+    {
+      "epoch": 0.16064,
+      "grad_norm": 0.38139190035074727,
+      "learning_rate": 0.0001911908026341956,
+      "loss": 0.7263,
+      "step": 502
+    },
+    {
+      "epoch": 0.16096,
+      "grad_norm": 0.3852884330965596,
+      "learning_rate": 0.00019114821678110094,
+      "loss": 0.7231,
+      "step": 503
+    },
+    {
+      "epoch": 0.16128,
+      "grad_norm": 0.35468670230811244,
+      "learning_rate": 0.00019110553300696786,
+      "loss": 0.6469,
+      "step": 504
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.3916841680909505,
+      "learning_rate": 0.0001910627513576518,
+      "loss": 0.6672,
+      "step": 505
+    },
+    {
+      "epoch": 0.16192,
+      "grad_norm": 0.3923617194910343,
+      "learning_rate": 0.0001910198718791133,
+      "loss": 0.713,
+      "step": 506
+    },
+    {
+      "epoch": 0.16224,
+      "grad_norm": 0.3854529589005894,
+      "learning_rate": 0.00019097689461741802,
+      "loss": 0.7133,
+      "step": 507
+    },
+    {
+      "epoch": 0.16256,
+      "grad_norm": 0.3847808905280083,
+      "learning_rate": 0.00019093381961873671,
+      "loss": 0.693,
+      "step": 508
+    },
+    {
+      "epoch": 0.16288,
+      "grad_norm": 0.43780163447968024,
+      "learning_rate": 0.00019089064692934507,
+      "loss": 0.7216,
+      "step": 509
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.38998446331237996,
+      "learning_rate": 0.0001908473765956237,
+      "loss": 0.6999,
+      "step": 510
+    },
+    {
+      "epoch": 0.16352,
+      "grad_norm": 0.3861633732692842,
+      "learning_rate": 0.00019080400866405825,
+      "loss": 0.6912,
+      "step": 511
+    },
+    {
+      "epoch": 0.16384,
+      "grad_norm": 0.3561765068441513,
+      "learning_rate": 0.0001907605431812391,
+      "loss": 0.6302,
+      "step": 512
+    },
+    {
+      "epoch": 0.16416,
+      "grad_norm": 0.4026059651880242,
+      "learning_rate": 0.00019071698019386144,
+      "loss": 0.6915,
+      "step": 513
+    },
+    {
+      "epoch": 0.16448,
+      "grad_norm": 0.39333405385172815,
+      "learning_rate": 0.00019067331974872525,
+      "loss": 0.7218,
+      "step": 514
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.41295627909129,
+      "learning_rate": 0.0001906295618927352,
+      "loss": 0.6675,
+      "step": 515
+    },
+    {
+      "epoch": 0.16512,
+      "grad_norm": 0.3864243276726692,
+      "learning_rate": 0.00019058570667290051,
+      "loss": 0.7098,
+      "step": 516
+    },
+    {
+      "epoch": 0.16544,
+      "grad_norm": 0.4197640644035024,
+      "learning_rate": 0.00019054175413633524,
+      "loss": 0.7277,
+      "step": 517
+    },
+    {
+      "epoch": 0.16576,
+      "grad_norm": 0.38306491913821406,
+      "learning_rate": 0.00019049770433025772,
+      "loss": 0.6582,
+      "step": 518
+    },
+    {
+      "epoch": 0.16608,
+      "grad_norm": 0.37746987041373586,
+      "learning_rate": 0.00019045355730199097,
+      "loss": 0.731,
+      "step": 519
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.36792208957798095,
+      "learning_rate": 0.00019040931309896236,
+      "loss": 0.6683,
+      "step": 520
+    },
+    {
+      "epoch": 0.16672,
+      "grad_norm": 0.361020649600021,
+      "learning_rate": 0.0001903649717687037,
+      "loss": 0.7403,
+      "step": 521
+    },
+    {
+      "epoch": 0.16704,
+      "grad_norm": 0.4190848206181769,
+      "learning_rate": 0.00019032053335885112,
+      "loss": 0.6752,
+      "step": 522
+    },
+    {
+      "epoch": 0.16736,
+      "grad_norm": 0.3610270293977483,
+      "learning_rate": 0.00019027599791714503,
+      "loss": 0.6831,
+      "step": 523
+    },
+    {
+      "epoch": 0.16768,
+      "grad_norm": 0.4101385097947774,
+      "learning_rate": 0.00019023136549143016,
+      "loss": 0.7375,
+      "step": 524
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3959746087881117,
+      "learning_rate": 0.00019018663612965534,
+      "loss": 0.6642,
+      "step": 525
+    },
+    {
+      "epoch": 0.16832,
+      "grad_norm": 0.36808147170037214,
+      "learning_rate": 0.00019014180987987357,
+      "loss": 0.6343,
+      "step": 526
+    },
+    {
+      "epoch": 0.16864,
+      "grad_norm": 0.3561230421496128,
+      "learning_rate": 0.0001900968867902419,
+      "loss": 0.6437,
+      "step": 527
+    },
+    {
+      "epoch": 0.16896,
+      "grad_norm": 0.3621727179859261,
+      "learning_rate": 0.00019005186690902157,
+      "loss": 0.6547,
+      "step": 528
+    },
+    {
+      "epoch": 0.16928,
+      "grad_norm": 0.38771305701415426,
+      "learning_rate": 0.00019000675028457757,
+      "loss": 0.6949,
+      "step": 529
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.37576856789118235,
+      "learning_rate": 0.00018996153696537903,
+      "loss": 0.6691,
+      "step": 530
+    },
+    {
+      "epoch": 0.16992,
+      "grad_norm": 0.42470508485120706,
+      "learning_rate": 0.00018991622699999884,
+      "loss": 0.676,
+      "step": 531
+    },
+    {
+      "epoch": 0.17024,
+      "grad_norm": 0.403438500284725,
+      "learning_rate": 0.0001898708204371137,
+      "loss": 0.6939,
+      "step": 532
+    },
+    {
+      "epoch": 0.17056,
+      "grad_norm": 0.38949096830920715,
+      "learning_rate": 0.0001898253173255042,
+      "loss": 0.7248,
+      "step": 533
+    },
+    {
+      "epoch": 0.17088,
+      "grad_norm": 0.3930678925892035,
+      "learning_rate": 0.00018977971771405453,
+      "loss": 0.7135,
+      "step": 534
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.37157515098936317,
+      "learning_rate": 0.00018973402165175268,
+      "loss": 0.6971,
+      "step": 535
+    },
+    {
+      "epoch": 0.17152,
+      "grad_norm": 0.36635220243071737,
+      "learning_rate": 0.00018968822918769012,
+      "loss": 0.7514,
+      "step": 536
+    },
+    {
+      "epoch": 0.17184,
+      "grad_norm": 0.3891934321186216,
+      "learning_rate": 0.00018964234037106202,
+      "loss": 0.6485,
+      "step": 537
+    },
+    {
+      "epoch": 0.17216,
+      "grad_norm": 0.35785957346908515,
+      "learning_rate": 0.0001895963552511669,
+      "loss": 0.6761,
+      "step": 538
+    },
+    {
+      "epoch": 0.17248,
+      "grad_norm": 0.3766558157041046,
+      "learning_rate": 0.00018955027387740692,
+      "loss": 0.6772,
+      "step": 539
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.37474470871910476,
+      "learning_rate": 0.00018950409629928748,
+      "loss": 0.7119,
+      "step": 540
+    },
+    {
+      "epoch": 0.17312,
+      "grad_norm": 0.374195870224283,
+      "learning_rate": 0.00018945782256641746,
+      "loss": 0.6774,
+      "step": 541
+    },
+    {
+      "epoch": 0.17344,
+      "grad_norm": 0.39037509683718274,
+      "learning_rate": 0.00018941145272850899,
+      "loss": 0.712,
+      "step": 542
+    },
+    {
+      "epoch": 0.17376,
+      "grad_norm": 0.37084603389527393,
+      "learning_rate": 0.0001893649868353774,
+      "loss": 0.6646,
+      "step": 543
+    },
+    {
+      "epoch": 0.17408,
+      "grad_norm": 0.3667470241932476,
+      "learning_rate": 0.00018931842493694135,
+      "loss": 0.7298,
+      "step": 544
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.3736990472485872,
+      "learning_rate": 0.00018927176708322243,
+      "loss": 0.6659,
+      "step": 545
+    },
+    {
+      "epoch": 0.17472,
+      "grad_norm": 0.38070538068360926,
+      "learning_rate": 0.0001892250133243455,
+      "loss": 0.6895,
+      "step": 546
+    },
+    {
+      "epoch": 0.17504,
+      "grad_norm": 0.38498718865211484,
+      "learning_rate": 0.0001891781637105384,
+      "loss": 0.739,
+      "step": 547
+    },
+    {
+      "epoch": 0.17536,
+      "grad_norm": 0.3891829575900404,
+      "learning_rate": 0.00018913121829213186,
+      "loss": 0.7068,
+      "step": 548
+    },
+    {
+      "epoch": 0.17568,
+      "grad_norm": 0.38228959084028435,
+      "learning_rate": 0.00018908417711955972,
+      "loss": 0.7266,
+      "step": 549
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3686057452451368,
+      "learning_rate": 0.00018903704024335848,
+      "loss": 0.689,
+      "step": 550
+    },
+    {
+      "epoch": 0.17632,
+      "grad_norm": 0.3776215415512684,
+      "learning_rate": 0.00018898980771416755,
+      "loss": 0.7439,
+      "step": 551
+    },
+    {
+      "epoch": 0.17664,
+      "grad_norm": 0.3625188691744574,
+      "learning_rate": 0.00018894247958272916,
+      "loss": 0.6908,
+      "step": 552
+    },
+    {
+      "epoch": 0.17696,
+      "grad_norm": 0.4268217105640349,
+      "learning_rate": 0.00018889505589988814,
+      "loss": 0.6962,
+      "step": 553
+    },
+    {
+      "epoch": 0.17728,
+      "grad_norm": 0.4026366543037032,
+      "learning_rate": 0.000188847536716592,
+      "loss": 0.7042,
+      "step": 554
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.3689913377626984,
+      "learning_rate": 0.00018879992208389092,
+      "loss": 0.6568,
+      "step": 555
+    },
+    {
+      "epoch": 0.17792,
+      "grad_norm": 0.4156387132204756,
+      "learning_rate": 0.00018875221205293756,
+      "loss": 0.688,
+      "step": 556
+    },
+    {
+      "epoch": 0.17824,
+      "grad_norm": 0.36971848701890186,
+      "learning_rate": 0.00018870440667498702,
+      "loss": 0.6952,
+      "step": 557
+    },
+    {
+      "epoch": 0.17856,
+      "grad_norm": 0.3523042166866371,
+      "learning_rate": 0.00018865650600139694,
+      "loss": 0.6254,
+      "step": 558
+    },
+    {
+      "epoch": 0.17888,
+      "grad_norm": 0.3630790996315484,
+      "learning_rate": 0.00018860851008362724,
+      "loss": 0.737,
+      "step": 559
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.38745311865216464,
+      "learning_rate": 0.0001885604189732402,
+      "loss": 0.7309,
+      "step": 560
+    },
+    {
+      "epoch": 0.17952,
+      "grad_norm": 0.38121925471868995,
+      "learning_rate": 0.00018851223272190043,
+      "loss": 0.6639,
+      "step": 561
+    },
+    {
+      "epoch": 0.17984,
+      "grad_norm": 0.3853261193292461,
+      "learning_rate": 0.00018846395138137466,
+      "loss": 0.7131,
+      "step": 562
+    },
+    {
+      "epoch": 0.18016,
+      "grad_norm": 0.36237384884975193,
+      "learning_rate": 0.00018841557500353176,
+      "loss": 0.6762,
+      "step": 563
+    },
+    {
+      "epoch": 0.18048,
+      "grad_norm": 0.3858707335243159,
+      "learning_rate": 0.00018836710364034275,
+      "loss": 0.6952,
+      "step": 564
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.37621457315239776,
+      "learning_rate": 0.00018831853734388077,
+      "loss": 0.7272,
+      "step": 565
+    },
+    {
+      "epoch": 0.18112,
+      "grad_norm": 0.3842489228267907,
+      "learning_rate": 0.00018826987616632078,
+      "loss": 0.7303,
+      "step": 566
+    },
+    {
+      "epoch": 0.18144,
+      "grad_norm": 0.36767466702570084,
+      "learning_rate": 0.00018822112015993975,
+      "loss": 0.7073,
+      "step": 567
+    },
+    {
+      "epoch": 0.18176,
+      "grad_norm": 0.37858562503127563,
+      "learning_rate": 0.00018817226937711657,
+      "loss": 0.6884,
+      "step": 568
+    },
+    {
+      "epoch": 0.18208,
+      "grad_norm": 0.3745839077458561,
+      "learning_rate": 0.00018812332387033195,
+      "loss": 0.6947,
+      "step": 569
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3727408909978038,
+      "learning_rate": 0.00018807428369216822,
+      "loss": 0.629,
+      "step": 570
+    },
+    {
+      "epoch": 0.18272,
+      "grad_norm": 0.39548208964953424,
+      "learning_rate": 0.00018802514889530958,
+      "loss": 0.7309,
+      "step": 571
+    },
+    {
+      "epoch": 0.18304,
+      "grad_norm": 0.4082997422118062,
+      "learning_rate": 0.0001879759195325418,
+      "loss": 0.6722,
+      "step": 572
+    },
+    {
+      "epoch": 0.18336,
+      "grad_norm": 0.38057927200287106,
+      "learning_rate": 0.0001879265956567523,
+      "loss": 0.7027,
+      "step": 573
+    },
+    {
+      "epoch": 0.18368,
+      "grad_norm": 0.3559895974761702,
+      "learning_rate": 0.0001878771773209299,
+      "loss": 0.6578,
+      "step": 574
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.3745492614745584,
+      "learning_rate": 0.00018782766457816504,
+      "loss": 0.7065,
+      "step": 575
+    },
+    {
+      "epoch": 0.18432,
+      "grad_norm": 0.38370921252873413,
+      "learning_rate": 0.00018777805748164964,
+      "loss": 0.7137,
+      "step": 576
+    },
+    {
+      "epoch": 0.18464,
+      "grad_norm": 0.394791073280638,
+      "learning_rate": 0.0001877283560846767,
+      "loss": 0.6762,
+      "step": 577
+    },
+    {
+      "epoch": 0.18496,
+      "grad_norm": 0.3761152634292773,
+      "learning_rate": 0.00018767856044064085,
+      "loss": 0.7014,
+      "step": 578
+    },
+    {
+      "epoch": 0.18528,
+      "grad_norm": 0.36935472973616645,
+      "learning_rate": 0.00018762867060303774,
+      "loss": 0.6656,
+      "step": 579
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.44574612059928553,
+      "learning_rate": 0.00018757868662546437,
+      "loss": 0.646,
+      "step": 580
+    },
+    {
+      "epoch": 0.18592,
+      "grad_norm": 0.3664767911747923,
+      "learning_rate": 0.00018752860856161875,
+      "loss": 0.7,
+      "step": 581
+    },
+    {
+      "epoch": 0.18624,
+      "grad_norm": 0.37030213814865126,
+      "learning_rate": 0.00018747843646530006,
+      "loss": 0.7116,
+      "step": 582
+    },
+    {
+      "epoch": 0.18656,
+      "grad_norm": 0.3845202168507924,
+      "learning_rate": 0.00018742817039040844,
+      "loss": 0.7011,
+      "step": 583
+    },
+    {
+      "epoch": 0.18688,
+      "grad_norm": 0.37705149960866635,
+      "learning_rate": 0.00018737781039094502,
+      "loss": 0.6824,
+      "step": 584
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.37495802093433617,
+      "learning_rate": 0.00018732735652101184,
+      "loss": 0.6756,
+      "step": 585
+    },
+    {
+      "epoch": 0.18752,
+      "grad_norm": 0.3491924235481701,
+      "learning_rate": 0.0001872768088348118,
+      "loss": 0.7014,
+      "step": 586
+    },
+    {
+      "epoch": 0.18784,
+      "grad_norm": 0.3536450319376141,
+      "learning_rate": 0.00018722616738664851,
+      "loss": 0.6333,
+      "step": 587
+    },
+    {
+      "epoch": 0.18816,
+      "grad_norm": 0.3763522998163525,
+      "learning_rate": 0.00018717543223092638,
+      "loss": 0.7015,
+      "step": 588
+    },
+    {
+      "epoch": 0.18848,
+      "grad_norm": 0.4240282292057168,
+      "learning_rate": 0.00018712460342215046,
+      "loss": 0.6875,
+      "step": 589
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3976781443918028,
+      "learning_rate": 0.00018707368101492645,
+      "loss": 0.692,
+      "step": 590
+    },
+    {
+      "epoch": 0.18912,
+      "grad_norm": 0.36554055739748414,
+      "learning_rate": 0.00018702266506396057,
+      "loss": 0.7052,
+      "step": 591
+    },
+    {
+      "epoch": 0.18944,
+      "grad_norm": 0.4053088186534514,
+      "learning_rate": 0.00018697155562405953,
+      "loss": 0.7275,
+      "step": 592
+    },
+    {
+      "epoch": 0.18976,
+      "grad_norm": 0.34807908938544707,
+      "learning_rate": 0.00018692035275013046,
+      "loss": 0.6079,
+      "step": 593
+    },
+    {
+      "epoch": 0.19008,
+      "grad_norm": 0.36271006865430816,
+      "learning_rate": 0.00018686905649718095,
+      "loss": 0.6911,
+      "step": 594
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.3852183272838404,
+      "learning_rate": 0.0001868176669203188,
+      "loss": 0.7336,
+      "step": 595
+    },
+    {
+      "epoch": 0.19072,
+      "grad_norm": 0.36141733539153165,
+      "learning_rate": 0.00018676618407475218,
+      "loss": 0.6923,
+      "step": 596
+    },
+    {
+      "epoch": 0.19104,
+      "grad_norm": 0.37200882287899356,
+      "learning_rate": 0.00018671460801578932,
+      "loss": 0.6247,
+      "step": 597
+    },
+    {
+      "epoch": 0.19136,
+      "grad_norm": 0.42640403770624996,
+      "learning_rate": 0.00018666293879883875,
+      "loss": 0.7021,
+      "step": 598
+    },
+    {
+      "epoch": 0.19168,
+      "grad_norm": 0.3767707196253604,
+      "learning_rate": 0.00018661117647940896,
+      "loss": 0.6896,
+      "step": 599
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.44811369749897056,
+      "learning_rate": 0.00018655932111310848,
+      "loss": 0.7261,
+      "step": 600
+    },
+    {
+      "epoch": 0.19232,
+      "grad_norm": 0.3849516109506925,
+      "learning_rate": 0.00018650737275564583,
+      "loss": 0.7014,
+      "step": 601
+    },
+    {
+      "epoch": 0.19264,
+      "grad_norm": 0.37317183147710203,
+      "learning_rate": 0.00018645533146282946,
+      "loss": 0.6821,
+      "step": 602
+    },
+    {
+      "epoch": 0.19296,
+      "grad_norm": 0.37610310276188946,
+      "learning_rate": 0.00018640319729056753,
+      "loss": 0.6341,
+      "step": 603
+    },
+    {
+      "epoch": 0.19328,
+      "grad_norm": 0.3682102458628179,
+      "learning_rate": 0.0001863509702948682,
+      "loss": 0.666,
+      "step": 604
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.3860499748534257,
+      "learning_rate": 0.00018629865053183911,
+      "loss": 0.6757,
+      "step": 605
+    },
+    {
+      "epoch": 0.19392,
+      "grad_norm": 0.3856807367418405,
+      "learning_rate": 0.00018624623805768776,
+      "loss": 0.7219,
+      "step": 606
+    },
+    {
+      "epoch": 0.19424,
+      "grad_norm": 0.38209384833687055,
+      "learning_rate": 0.00018619373292872108,
+      "loss": 0.7063,
+      "step": 607
+    },
+    {
+      "epoch": 0.19456,
+      "grad_norm": 0.35863991143047497,
+      "learning_rate": 0.00018614113520134566,
+      "loss": 0.6827,
+      "step": 608
+    },
+    {
+      "epoch": 0.19488,
+      "grad_norm": 0.3823933385249185,
+      "learning_rate": 0.0001860884449320676,
+      "loss": 0.7287,
+      "step": 609
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3581417414576101,
+      "learning_rate": 0.00018603566217749223,
+      "loss": 0.7085,
+      "step": 610
+    },
+    {
+      "epoch": 0.19552,
+      "grad_norm": 0.377786033844024,
+      "learning_rate": 0.00018598278699432443,
+      "loss": 0.7461,
+      "step": 611
+    },
+    {
+      "epoch": 0.19584,
+      "grad_norm": 0.3927384796181169,
+      "learning_rate": 0.0001859298194393683,
+      "loss": 0.7145,
+      "step": 612
+    },
+    {
+      "epoch": 0.19616,
+      "grad_norm": 0.38126302858738637,
+      "learning_rate": 0.00018587675956952717,
+      "loss": 0.7056,
+      "step": 613
+    },
+    {
+      "epoch": 0.19648,
+      "grad_norm": 0.4017051851174728,
+      "learning_rate": 0.00018582360744180356,
+      "loss": 0.7194,
+      "step": 614
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.3832804262944955,
+      "learning_rate": 0.0001857703631132991,
+      "loss": 0.7236,
+      "step": 615
+    },
+    {
+      "epoch": 0.19712,
+      "grad_norm": 0.3835507525150527,
+      "learning_rate": 0.00018571702664121445,
+      "loss": 0.7127,
+      "step": 616
+    },
+    {
+      "epoch": 0.19744,
+      "grad_norm": 0.3717950930513399,
+      "learning_rate": 0.0001856635980828493,
+      "loss": 0.6778,
+      "step": 617
+    },
+    {
+      "epoch": 0.19776,
+      "grad_norm": 0.3892396454500002,
+      "learning_rate": 0.00018561007749560223,
+      "loss": 0.6843,
+      "step": 618
+    },
+    {
+      "epoch": 0.19808,
+      "grad_norm": 0.3884263240541612,
+      "learning_rate": 0.00018555646493697073,
+      "loss": 0.6794,
+      "step": 619
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.36032717315297097,
+      "learning_rate": 0.00018550276046455107,
+      "loss": 0.6353,
+      "step": 620
+    },
+    {
+      "epoch": 0.19872,
+      "grad_norm": 0.3717455125989437,
+      "learning_rate": 0.00018544896413603824,
+      "loss": 0.6894,
+      "step": 621
+    },
+    {
+      "epoch": 0.19904,
+      "grad_norm": 0.4029382846007775,
+      "learning_rate": 0.00018539507600922597,
+      "loss": 0.6444,
+      "step": 622
+    },
+    {
+      "epoch": 0.19936,
+      "grad_norm": 0.35177763402127044,
+      "learning_rate": 0.00018534109614200652,
+      "loss": 0.6194,
+      "step": 623
+    },
+    {
+      "epoch": 0.19968,
+      "grad_norm": 0.37288226756772497,
+      "learning_rate": 0.00018528702459237083,
+      "loss": 0.6918,
+      "step": 624
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3649060754242833,
+      "learning_rate": 0.0001852328614184082,
+      "loss": 0.649,
+      "step": 625
+    },
+    {
+      "epoch": 0.20032,
+      "grad_norm": 0.36634977948284103,
+      "learning_rate": 0.00018517860667830648,
+      "loss": 0.7065,
+      "step": 626
+    },
+    {
+      "epoch": 0.20064,
+      "grad_norm": 0.37908346594012443,
+      "learning_rate": 0.00018512426043035184,
+      "loss": 0.7237,
+      "step": 627
+    },
+    {
+      "epoch": 0.20096,
+      "grad_norm": 0.34925470856243007,
+      "learning_rate": 0.00018506982273292874,
+      "loss": 0.7002,
+      "step": 628
+    },
+    {
+      "epoch": 0.20128,
+      "grad_norm": 0.34970529649550897,
+      "learning_rate": 0.00018501529364451993,
+      "loss": 0.6736,
+      "step": 629
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3554469104415349,
+      "learning_rate": 0.00018496067322370627,
+      "loss": 0.6963,
+      "step": 630
+    },
+    {
+      "epoch": 0.20192,
+      "grad_norm": 0.3551417578484709,
+      "learning_rate": 0.0001849059615291668,
+      "loss": 0.629,
+      "step": 631
+    },
+    {
+      "epoch": 0.20224,
+      "grad_norm": 0.3618812513414055,
+      "learning_rate": 0.0001848511586196786,
+      "loss": 0.6343,
+      "step": 632
+    },
+    {
+      "epoch": 0.20256,
+      "grad_norm": 0.3691591254135277,
+      "learning_rate": 0.00018479626455411677,
+      "loss": 0.6497,
+      "step": 633
+    },
+    {
+      "epoch": 0.20288,
+      "grad_norm": 0.3791082050849236,
+      "learning_rate": 0.00018474127939145424,
+      "loss": 0.7174,
+      "step": 634
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3826804218084318,
+      "learning_rate": 0.00018468620319076197,
+      "loss": 0.6696,
+      "step": 635
+    },
+    {
+      "epoch": 0.20352,
+      "grad_norm": 0.38073044965980335,
+      "learning_rate": 0.00018463103601120857,
+      "loss": 0.6615,
+      "step": 636
+    },
+    {
+      "epoch": 0.20384,
+      "grad_norm": 0.35355843658521047,
+      "learning_rate": 0.00018457577791206048,
+      "loss": 0.6882,
+      "step": 637
+    },
+    {
+      "epoch": 0.20416,
+      "grad_norm": 0.36121294050894603,
+      "learning_rate": 0.0001845204289526817,
+      "loss": 0.6584,
+      "step": 638
+    },
+    {
+      "epoch": 0.20448,
+      "grad_norm": 0.3349490954718071,
+      "learning_rate": 0.00018446498919253408,
+      "loss": 0.64,
+      "step": 639
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.37603377394994797,
+      "learning_rate": 0.00018440945869117675,
+      "loss": 0.7433,
+      "step": 640
+    },
+    {
+      "epoch": 0.20512,
+      "grad_norm": 0.3527726682879851,
+      "learning_rate": 0.00018435383750826643,
+      "loss": 0.6961,
+      "step": 641
+    },
+    {
+      "epoch": 0.20544,
+      "grad_norm": 0.45068127304993216,
+      "learning_rate": 0.00018429812570355732,
+      "loss": 0.642,
+      "step": 642
+    },
+    {
+      "epoch": 0.20576,
+      "grad_norm": 0.38538800083322367,
+      "learning_rate": 0.00018424232333690094,
+      "loss": 0.6647,
+      "step": 643
+    },
+    {
+      "epoch": 0.20608,
+      "grad_norm": 0.36591788596065095,
+      "learning_rate": 0.00018418643046824604,
+      "loss": 0.712,
+      "step": 644
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.3707034695442753,
+      "learning_rate": 0.00018413044715763862,
+      "loss": 0.7094,
+      "step": 645
+    },
+    {
+      "epoch": 0.20672,
+      "grad_norm": 0.4120339445920212,
+      "learning_rate": 0.00018407437346522194,
+      "loss": 0.7112,
+      "step": 646
+    },
+    {
+      "epoch": 0.20704,
+      "grad_norm": 0.3790663390655863,
+      "learning_rate": 0.0001840182094512362,
+      "loss": 0.7387,
+      "step": 647
+    },
+    {
+      "epoch": 0.20736,
+      "grad_norm": 0.3467089104227242,
+      "learning_rate": 0.00018396195517601875,
+      "loss": 0.6439,
+      "step": 648
+    },
+    {
+      "epoch": 0.20768,
+      "grad_norm": 0.3625763564568996,
+      "learning_rate": 0.00018390561070000388,
+      "loss": 0.6602,
+      "step": 649
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.37642659346842217,
+      "learning_rate": 0.00018384917608372278,
+      "loss": 0.696,
+      "step": 650
+    },
+    {
+      "epoch": 0.20832,
+      "grad_norm": 0.37929555899812895,
+      "learning_rate": 0.00018379265138780343,
+      "loss": 0.7112,
+      "step": 651
+    },
+    {
+      "epoch": 0.20864,
+      "grad_norm": 0.374471560265218,
+      "learning_rate": 0.00018373603667297067,
+      "loss": 0.6724,
+      "step": 652
+    },
+    {
+      "epoch": 0.20896,
+      "grad_norm": 0.38324649849294595,
+      "learning_rate": 0.000183679332000046,
+      "loss": 0.6798,
+      "step": 653
+    },
+    {
+      "epoch": 0.20928,
+      "grad_norm": 0.37878220851681876,
+      "learning_rate": 0.00018362253742994756,
+      "loss": 0.6337,
+      "step": 654
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.392281924972558,
+      "learning_rate": 0.0001835656530236901,
+      "loss": 0.7025,
+      "step": 655
+    },
+    {
+      "epoch": 0.20992,
+      "grad_norm": 0.3602190368976346,
+      "learning_rate": 0.00018350867884238476,
+      "loss": 0.6626,
+      "step": 656
+    },
+    {
+      "epoch": 0.21024,
+      "grad_norm": 0.37026535929650906,
+      "learning_rate": 0.00018345161494723935,
+      "loss": 0.6639,
+      "step": 657
+    },
+    {
+      "epoch": 0.21056,
+      "grad_norm": 0.36671926006499656,
+      "learning_rate": 0.00018339446139955783,
+      "loss": 0.657,
+      "step": 658
+    },
+    {
+      "epoch": 0.21088,
+      "grad_norm": 0.3575424799140936,
+      "learning_rate": 0.00018333721826074064,
+      "loss": 0.6745,
+      "step": 659
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.37607042388640194,
+      "learning_rate": 0.00018327988559228438,
+      "loss": 0.6968,
+      "step": 660
+    },
+    {
+      "epoch": 0.21152,
+      "grad_norm": 0.3678610855760465,
+      "learning_rate": 0.0001832224634557818,
+      "loss": 0.6675,
+      "step": 661
+    },
+    {
+      "epoch": 0.21184,
+      "grad_norm": 0.3868273786573624,
+      "learning_rate": 0.00018316495191292195,
+      "loss": 0.6945,
+      "step": 662
+    },
+    {
+      "epoch": 0.21216,
+      "grad_norm": 0.3775942900351484,
+      "learning_rate": 0.00018310735102548972,
+      "loss": 0.6907,
+      "step": 663
+    },
+    {
+      "epoch": 0.21248,
+      "grad_norm": 0.3593499650917196,
+      "learning_rate": 0.00018304966085536602,
+      "loss": 0.6227,
+      "step": 664
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.43533651316148203,
+      "learning_rate": 0.0001829918814645278,
+      "loss": 0.6884,
+      "step": 665
+    },
+    {
+      "epoch": 0.21312,
+      "grad_norm": 1.39738823936067,
+      "learning_rate": 0.0001829340129150478,
+      "loss": 0.6519,
+      "step": 666
+    },
+    {
+      "epoch": 0.21344,
+      "grad_norm": 0.3893288733656194,
+      "learning_rate": 0.00018287605526909445,
+      "loss": 0.6419,
+      "step": 667
+    },
+    {
+      "epoch": 0.21376,
+      "grad_norm": 0.35844663505549856,
+      "learning_rate": 0.00018281800858893204,
+      "loss": 0.671,
+      "step": 668
+    },
+    {
+      "epoch": 0.21408,
+      "grad_norm": 0.34876632435901844,
+      "learning_rate": 0.00018275987293692034,
+      "loss": 0.6805,
+      "step": 669
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3714197501969998,
+      "learning_rate": 0.00018270164837551494,
+      "loss": 0.6589,
+      "step": 670
+    },
+    {
+      "epoch": 0.21472,
+      "grad_norm": 0.3730668762325715,
+      "learning_rate": 0.0001826433349672667,
+      "loss": 0.6848,
+      "step": 671
+    },
+    {
+      "epoch": 0.21504,
+      "grad_norm": 0.38536987660894045,
+      "learning_rate": 0.00018258493277482213,
+      "loss": 0.6877,
+      "step": 672
+    },
+    {
+      "epoch": 0.21536,
+      "grad_norm": 0.4048527650106258,
+      "learning_rate": 0.00018252644186092298,
+      "loss": 0.7041,
+      "step": 673
+    },
+    {
+      "epoch": 0.21568,
+      "grad_norm": 0.3757092389015418,
+      "learning_rate": 0.00018246786228840635,
+      "loss": 0.7229,
+      "step": 674
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.39499355090932425,
+      "learning_rate": 0.00018240919412020466,
+      "loss": 0.6775,
+      "step": 675
+    },
+    {
+      "epoch": 0.21632,
+      "grad_norm": 0.3703427030667556,
+      "learning_rate": 0.0001823504374193454,
+      "loss": 0.6511,
+      "step": 676
+    },
+    {
+      "epoch": 0.21664,
+      "grad_norm": 0.38373506154959813,
+      "learning_rate": 0.00018229159224895122,
+      "loss": 0.6533,
+      "step": 677
+    },
+    {
+      "epoch": 0.21696,
+      "grad_norm": 0.3696082782693057,
+      "learning_rate": 0.00018223265867223985,
+      "loss": 0.7134,
+      "step": 678
+    },
+    {
+      "epoch": 0.21728,
+      "grad_norm": 0.3797972880923753,
+      "learning_rate": 0.00018217363675252396,
+      "loss": 0.6932,
+      "step": 679
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.37870990288539186,
+      "learning_rate": 0.00018211452655321112,
+      "loss": 0.6726,
+      "step": 680
+    },
+    {
+      "epoch": 0.21792,
+      "grad_norm": 0.361952971987553,
+      "learning_rate": 0.0001820553281378037,
+      "loss": 0.6157,
+      "step": 681
+    },
+    {
+      "epoch": 0.21824,
+      "grad_norm": 0.3972171846290433,
+      "learning_rate": 0.00018199604156989897,
+      "loss": 0.6592,
+      "step": 682
+    },
+    {
+      "epoch": 0.21856,
+      "grad_norm": 0.36658247890412493,
+      "learning_rate": 0.00018193666691318874,
+      "loss": 0.67,
+      "step": 683
+    },
+    {
+      "epoch": 0.21888,
+      "grad_norm": 0.4110491988798091,
+      "learning_rate": 0.0001818772042314596,
+      "loss": 0.699,
+      "step": 684
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.3611727041903771,
+      "learning_rate": 0.00018181765358859261,
+      "loss": 0.7139,
+      "step": 685
+    },
+    {
+      "epoch": 0.21952,
+      "grad_norm": 0.386263582558337,
+      "learning_rate": 0.00018175801504856335,
+      "loss": 0.6923,
+      "step": 686
+    },
+    {
+      "epoch": 0.21984,
+      "grad_norm": 0.38675112190263383,
+      "learning_rate": 0.00018169828867544186,
+      "loss": 0.6989,
+      "step": 687
+    },
+    {
+      "epoch": 0.22016,
+      "grad_norm": 0.3674138936551115,
+      "learning_rate": 0.0001816384745333925,
+      "loss": 0.6743,
+      "step": 688
+    },
+    {
+      "epoch": 0.22048,
+      "grad_norm": 0.4213474729444471,
+      "learning_rate": 0.00018157857268667396,
+      "loss": 0.7251,
+      "step": 689
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.39289033366716836,
+      "learning_rate": 0.00018151858319963914,
+      "loss": 0.7161,
+      "step": 690
+    },
+    {
+      "epoch": 0.22112,
+      "grad_norm": 0.4010299377303811,
+      "learning_rate": 0.00018145850613673502,
+      "loss": 0.6701,
+      "step": 691
+    },
+    {
+      "epoch": 0.22144,
+      "grad_norm": 0.3978792715503202,
+      "learning_rate": 0.00018139834156250277,
+      "loss": 0.7011,
+      "step": 692
+    },
+    {
+      "epoch": 0.22176,
+      "grad_norm": 0.35428077731562635,
+      "learning_rate": 0.00018133808954157749,
+      "loss": 0.6718,
+      "step": 693
+    },
+    {
+      "epoch": 0.22208,
+      "grad_norm": 0.3692508625538983,
+      "learning_rate": 0.00018127775013868834,
+      "loss": 0.6909,
+      "step": 694
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.36310482844960207,
+      "learning_rate": 0.00018121732341865818,
+      "loss": 0.6494,
+      "step": 695
+    },
+    {
+      "epoch": 0.22272,
+      "grad_norm": 0.35451425732355163,
+      "learning_rate": 0.00018115680944640384,
+      "loss": 0.5966,
+      "step": 696
+    },
+    {
+      "epoch": 0.22304,
+      "grad_norm": 0.3759293895978303,
+      "learning_rate": 0.0001810962082869358,
+      "loss": 0.6973,
+      "step": 697
+    },
+    {
+      "epoch": 0.22336,
+      "grad_norm": 0.3968830045056176,
+      "learning_rate": 0.00018103552000535818,
+      "loss": 0.6588,
+      "step": 698
+    },
+    {
+      "epoch": 0.22368,
+      "grad_norm": 0.37493430010283585,
+      "learning_rate": 0.00018097474466686884,
+      "loss": 0.7354,
+      "step": 699
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4162348933142053,
+      "learning_rate": 0.00018091388233675896,
+      "loss": 0.6686,
+      "step": 700
+    },
+    {
+      "epoch": 0.22432,
+      "grad_norm": 0.5505324084612641,
+      "learning_rate": 0.00018085293308041335,
+      "loss": 0.6886,
+      "step": 701
+    },
+    {
+      "epoch": 0.22464,
+      "grad_norm": 0.4116182971170426,
+      "learning_rate": 0.00018079189696331013,
+      "loss": 0.7314,
+      "step": 702
+    },
+    {
+      "epoch": 0.22496,
+      "grad_norm": 0.4153567985231021,
+      "learning_rate": 0.00018073077405102072,
+      "loss": 0.6491,
+      "step": 703
+    },
+    {
+      "epoch": 0.22528,
+      "grad_norm": 0.3764911933775025,
+      "learning_rate": 0.0001806695644092098,
+      "loss": 0.6686,
+      "step": 704
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.3876964994224437,
+      "learning_rate": 0.00018060826810363523,
+      "loss": 0.7036,
+      "step": 705
+    },
+    {
+      "epoch": 0.22592,
+      "grad_norm": 0.35928093842692577,
+      "learning_rate": 0.000180546885200148,
+      "loss": 0.7131,
+      "step": 706
+    },
+    {
+      "epoch": 0.22624,
+      "grad_norm": 0.3708341293662108,
+      "learning_rate": 0.0001804854157646921,
+      "loss": 0.6764,
+      "step": 707
+    },
+    {
+      "epoch": 0.22656,
+      "grad_norm": 0.3641271955234546,
+      "learning_rate": 0.00018042385986330448,
+      "loss": 0.7012,
+      "step": 708
+    },
+    {
+      "epoch": 0.22688,
+      "grad_norm": 0.351875239335879,
+      "learning_rate": 0.000180362217562115,
+      "loss": 0.6925,
+      "step": 709
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.374666539626361,
+      "learning_rate": 0.0001803004889273463,
+      "loss": 0.6703,
+      "step": 710
+    },
+    {
+      "epoch": 0.22752,
+      "grad_norm": 0.36141450149841525,
+      "learning_rate": 0.0001802386740253138,
+      "loss": 0.6586,
+      "step": 711
+    },
+    {
+      "epoch": 0.22784,
+      "grad_norm": 0.3879825984734728,
+      "learning_rate": 0.00018017677292242562,
+      "loss": 0.6951,
+      "step": 712
+    },
+    {
+      "epoch": 0.22816,
+      "grad_norm": 0.3880697443654585,
+      "learning_rate": 0.00018011478568518246,
+      "loss": 0.6729,
+      "step": 713
+    },
+    {
+      "epoch": 0.22848,
+      "grad_norm": 0.36702448036834806,
+      "learning_rate": 0.00018005271238017754,
+      "loss": 0.6813,
+      "step": 714
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.3654198048597913,
+      "learning_rate": 0.00017999055307409657,
+      "loss": 0.6694,
+      "step": 715
+    },
+    {
+      "epoch": 0.22912,
+      "grad_norm": 0.3544780790270675,
+      "learning_rate": 0.00017992830783371763,
+      "loss": 0.681,
+      "step": 716
+    },
+    {
+      "epoch": 0.22944,
+      "grad_norm": 0.36471597622987206,
+      "learning_rate": 0.00017986597672591111,
+      "loss": 0.6634,
+      "step": 717
+    },
+    {
+      "epoch": 0.22976,
+      "grad_norm": 0.3746561832452932,
+      "learning_rate": 0.00017980355981763973,
+      "loss": 0.7001,
+      "step": 718
+    },
+    {
+      "epoch": 0.23008,
+      "grad_norm": 0.3789342838340834,
+      "learning_rate": 0.00017974105717595825,
+      "loss": 0.7074,
+      "step": 719
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3802972783120846,
+      "learning_rate": 0.00017967846886801365,
+      "loss": 0.7069,
+      "step": 720
+    },
+    {
+      "epoch": 0.23072,
+      "grad_norm": 0.36526081968866997,
+      "learning_rate": 0.00017961579496104488,
+      "loss": 0.6373,
+      "step": 721
+    },
+    {
+      "epoch": 0.23104,
+      "grad_norm": 0.3672669737592333,
+      "learning_rate": 0.0001795530355223829,
+      "loss": 0.632,
+      "step": 722
+    },
+    {
+      "epoch": 0.23136,
+      "grad_norm": 0.3678652252568809,
+      "learning_rate": 0.00017949019061945046,
+      "loss": 0.6478,
+      "step": 723
+    },
+    {
+      "epoch": 0.23168,
+      "grad_norm": 0.39822846891907343,
+      "learning_rate": 0.0001794272603197623,
+      "loss": 0.7088,
+      "step": 724
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.40129414401337665,
+      "learning_rate": 0.00017936424469092467,
+      "loss": 0.6848,
+      "step": 725
+    },
+    {
+      "epoch": 0.23232,
+      "grad_norm": 0.4058611280373537,
+      "learning_rate": 0.00017930114380063566,
+      "loss": 0.7179,
+      "step": 726
+    },
+    {
+      "epoch": 0.23264,
+      "grad_norm": 0.3657740769226459,
+      "learning_rate": 0.00017923795771668493,
+      "loss": 0.6549,
+      "step": 727
+    },
+    {
+      "epoch": 0.23296,
+      "grad_norm": 0.385324554704592,
+      "learning_rate": 0.00017917468650695365,
+      "loss": 0.6989,
+      "step": 728
+    },
+    {
+      "epoch": 0.23328,
+      "grad_norm": 0.3931880693189958,
+      "learning_rate": 0.00017911133023941443,
+      "loss": 0.6615,
+      "step": 729
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3738320684033318,
+      "learning_rate": 0.0001790478889821312,
+      "loss": 0.6984,
+      "step": 730
+    },
+    {
+      "epoch": 0.23392,
+      "grad_norm": 0.37472414578951646,
+      "learning_rate": 0.0001789843628032593,
+      "loss": 0.7068,
+      "step": 731
+    },
+    {
+      "epoch": 0.23424,
+      "grad_norm": 0.3720541748071402,
+      "learning_rate": 0.0001789207517710453,
+      "loss": 0.6493,
+      "step": 732
+    },
+    {
+      "epoch": 0.23456,
+      "grad_norm": 0.49658220251926044,
+      "learning_rate": 0.00017885705595382682,
+      "loss": 0.6343,
+      "step": 733
+    },
+    {
+      "epoch": 0.23488,
+      "grad_norm": 0.4092338956235856,
+      "learning_rate": 0.00017879327542003265,
+      "loss": 0.7418,
+      "step": 734
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.38867382782007576,
+      "learning_rate": 0.0001787294102381826,
+      "loss": 0.6749,
+      "step": 735
+    },
+    {
+      "epoch": 0.23552,
+      "grad_norm": 0.3829738650156427,
+      "learning_rate": 0.00017866546047688736,
+      "loss": 0.6503,
+      "step": 736
+    },
+    {
+      "epoch": 0.23584,
+      "grad_norm": 0.3457762290921722,
+      "learning_rate": 0.0001786014262048486,
+      "loss": 0.6054,
+      "step": 737
+    },
+    {
+      "epoch": 0.23616,
+      "grad_norm": 0.3950841371672311,
+      "learning_rate": 0.00017853730749085856,
+      "loss": 0.6663,
+      "step": 738
+    },
+    {
+      "epoch": 0.23648,
+      "grad_norm": 0.3921900649317959,
+      "learning_rate": 0.0001784731044038004,
+      "loss": 0.7087,
+      "step": 739
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.37203824026401816,
+      "learning_rate": 0.0001784088170126479,
+      "loss": 0.6614,
+      "step": 740
+    },
+    {
+      "epoch": 0.23712,
+      "grad_norm": 0.4169831346176795,
+      "learning_rate": 0.00017834444538646527,
+      "loss": 0.6891,
+      "step": 741
+    },
+    {
+      "epoch": 0.23744,
+      "grad_norm": 0.3430654181829531,
+      "learning_rate": 0.00017827998959440736,
+      "loss": 0.6283,
+      "step": 742
+    },
+    {
+      "epoch": 0.23776,
+      "grad_norm": 0.3723546203784155,
+      "learning_rate": 0.0001782154497057194,
+      "loss": 0.6607,
+      "step": 743
+    },
+    {
+      "epoch": 0.23808,
+      "grad_norm": 0.38409773935642044,
+      "learning_rate": 0.00017815082578973693,
+      "loss": 0.6974,
+      "step": 744
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.3529225958218236,
+      "learning_rate": 0.00017808611791588584,
+      "loss": 0.6461,
+      "step": 745
+    },
+    {
+      "epoch": 0.23872,
+      "grad_norm": 0.38105695662774514,
+      "learning_rate": 0.00017802132615368205,
+      "loss": 0.713,
+      "step": 746
+    },
+    {
+      "epoch": 0.23904,
+      "grad_norm": 0.3926402910525965,
+      "learning_rate": 0.00017795645057273177,
+      "loss": 0.689,
+      "step": 747
+    },
+    {
+      "epoch": 0.23936,
+      "grad_norm": 0.3790104979677121,
+      "learning_rate": 0.00017789149124273123,
+      "loss": 0.6834,
+      "step": 748
+    },
+    {
+      "epoch": 0.23968,
+      "grad_norm": 0.4045459184686704,
+      "learning_rate": 0.00017782644823346658,
+      "loss": 0.6446,
+      "step": 749
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.40319121564248955,
+      "learning_rate": 0.00017776132161481385,
+      "loss": 0.6753,
+      "step": 750
+    },
+    {
+      "epoch": 0.24032,
+      "grad_norm": 0.3634116972046028,
+      "learning_rate": 0.000177696111456739,
+      "loss": 0.6109,
+      "step": 751
+    },
+    {
+      "epoch": 0.24064,
+      "grad_norm": 0.3781068354581363,
+      "learning_rate": 0.00017763081782929757,
+      "loss": 0.665,
+      "step": 752
+    },
+    {
+      "epoch": 0.24096,
+      "grad_norm": 0.39310915820479103,
+      "learning_rate": 0.00017756544080263495,
+      "loss": 0.6554,
+      "step": 753
+    },
+    {
+      "epoch": 0.24128,
+      "grad_norm": 0.3761835536526668,
+      "learning_rate": 0.00017749998044698607,
+      "loss": 0.6648,
+      "step": 754
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.3748500098790218,
+      "learning_rate": 0.00017743443683267525,
+      "loss": 0.7238,
+      "step": 755
+    },
+    {
+      "epoch": 0.24192,
+      "grad_norm": 0.3506351603982856,
+      "learning_rate": 0.00017736881003011643,
+      "loss": 0.6503,
+      "step": 756
+    },
+    {
+      "epoch": 0.24224,
+      "grad_norm": 0.3604576790932431,
+      "learning_rate": 0.00017730310010981285,
+      "loss": 0.6839,
+      "step": 757
+    },
+    {
+      "epoch": 0.24256,
+      "grad_norm": 0.3789260711437009,
+      "learning_rate": 0.00017723730714235705,
+      "loss": 0.6355,
+      "step": 758
+    },
+    {
+      "epoch": 0.24288,
+      "grad_norm": 0.36539702949886815,
+      "learning_rate": 0.00017717143119843075,
+      "loss": 0.6508,
+      "step": 759
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3897347490416631,
+      "learning_rate": 0.00017710547234880486,
+      "loss": 0.655,
+      "step": 760
+    },
+    {
+      "epoch": 0.24352,
+      "grad_norm": 0.38799776162309213,
+      "learning_rate": 0.00017703943066433935,
+      "loss": 0.6799,
+      "step": 761
+    },
+    {
+      "epoch": 0.24384,
+      "grad_norm": 0.34789732571574994,
+      "learning_rate": 0.00017697330621598313,
+      "loss": 0.5818,
+      "step": 762
+    },
+    {
+      "epoch": 0.24416,
+      "grad_norm": 0.3681128745265903,
+      "learning_rate": 0.00017690709907477412,
+      "loss": 0.6985,
+      "step": 763
+    },
+    {
+      "epoch": 0.24448,
+      "grad_norm": 0.36664982899708504,
+      "learning_rate": 0.000176840809311839,
+      "loss": 0.6823,
+      "step": 764
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.369593144412236,
+      "learning_rate": 0.0001767744369983932,
+      "loss": 0.7022,
+      "step": 765
+    },
+    {
+      "epoch": 0.24512,
+      "grad_norm": 0.3696850026902434,
+      "learning_rate": 0.0001767079822057409,
+      "loss": 0.7249,
+      "step": 766
+    },
+    {
+      "epoch": 0.24544,
+      "grad_norm": 0.36928768768576387,
+      "learning_rate": 0.0001766414450052749,
+      "loss": 0.707,
+      "step": 767
+    },
+    {
+      "epoch": 0.24576,
+      "grad_norm": 0.3608620469117216,
+      "learning_rate": 0.0001765748254684764,
+      "loss": 0.6627,
+      "step": 768
+    },
+    {
+      "epoch": 0.24608,
+      "grad_norm": 0.35888561707044647,
+      "learning_rate": 0.0001765081236669152,
+      "loss": 0.6559,
+      "step": 769
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.34718463650647857,
+      "learning_rate": 0.0001764413396722494,
+      "loss": 0.648,
+      "step": 770
+    },
+    {
+      "epoch": 0.24672,
+      "grad_norm": 0.3535437473758207,
+      "learning_rate": 0.00017637447355622538,
+      "loss": 0.6596,
+      "step": 771
+    },
+    {
+      "epoch": 0.24704,
+      "grad_norm": 0.38919238187688104,
+      "learning_rate": 0.00017630752539067785,
+      "loss": 0.6867,
+      "step": 772
+    },
+    {
+      "epoch": 0.24736,
+      "grad_norm": 0.36571627750283614,
+      "learning_rate": 0.00017624049524752954,
+      "loss": 0.6347,
+      "step": 773
+    },
+    {
+      "epoch": 0.24768,
+      "grad_norm": 0.35948035619621493,
+      "learning_rate": 0.00017617338319879136,
+      "loss": 0.6954,
+      "step": 774
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3636966174866779,
+      "learning_rate": 0.0001761061893165621,
+      "loss": 0.6368,
+      "step": 775
+    },
+    {
+      "epoch": 0.24832,
+      "grad_norm": 0.3623952898853704,
+      "learning_rate": 0.0001760389136730286,
+      "loss": 0.6919,
+      "step": 776
+    },
+    {
+      "epoch": 0.24864,
+      "grad_norm": 0.3744531382753775,
+      "learning_rate": 0.00017597155634046537,
+      "loss": 0.6654,
+      "step": 777
+    },
+    {
+      "epoch": 0.24896,
+      "grad_norm": 0.38270320727918494,
+      "learning_rate": 0.00017590411739123484,
+      "loss": 0.6765,
+      "step": 778
+    },
+    {
+      "epoch": 0.24928,
+      "grad_norm": 0.34822030214222854,
+      "learning_rate": 0.000175836596897787,
+      "loss": 0.6288,
+      "step": 779
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.38997789732359966,
+      "learning_rate": 0.00017576899493265954,
+      "loss": 0.6895,
+      "step": 780
+    },
+    {
+      "epoch": 0.24992,
+      "grad_norm": 0.3710814808434627,
+      "learning_rate": 0.00017570131156847756,
+      "loss": 0.6511,
+      "step": 781
+    },
+    {
+      "epoch": 0.25024,
+      "grad_norm": 0.37325666573165384,
+      "learning_rate": 0.00017563354687795375,
+      "loss": 0.7231,
+      "step": 782
+    },
+    {
+      "epoch": 0.25056,
+      "grad_norm": 0.38392593308239936,
+      "learning_rate": 0.00017556570093388806,
+      "loss": 0.6574,
+      "step": 783
+    },
+    {
+      "epoch": 0.25088,
+      "grad_norm": 0.42112599740387235,
+      "learning_rate": 0.00017549777380916777,
+      "loss": 0.6723,
+      "step": 784
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3521731958080482,
+      "learning_rate": 0.00017542976557676738,
+      "loss": 0.6597,
+      "step": 785
+    },
+    {
+      "epoch": 0.25152,
+      "grad_norm": 0.3851579490484671,
+      "learning_rate": 0.00017536167630974854,
+      "loss": 0.6971,
+      "step": 786
+    },
+    {
+      "epoch": 0.25184,
+      "grad_norm": 0.41011682634144714,
+      "learning_rate": 0.00017529350608125986,
+      "loss": 0.7226,
+      "step": 787
+    },
+    {
+      "epoch": 0.25216,
+      "grad_norm": 0.39012475246529094,
+      "learning_rate": 0.00017522525496453702,
+      "loss": 0.7073,
+      "step": 788
+    },
+    {
+      "epoch": 0.25248,
+      "grad_norm": 0.37831221735486503,
+      "learning_rate": 0.00017515692303290262,
+      "loss": 0.6941,
+      "step": 789
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.3961407873087152,
+      "learning_rate": 0.00017508851035976598,
+      "loss": 0.6362,
+      "step": 790
+    },
+    {
+      "epoch": 0.25312,
+      "grad_norm": 0.38372475028444036,
+      "learning_rate": 0.00017502001701862323,
+      "loss": 0.7038,
+      "step": 791
+    },
+    {
+      "epoch": 0.25344,
+      "grad_norm": 0.39253207378164034,
+      "learning_rate": 0.0001749514430830572,
+      "loss": 0.7087,
+      "step": 792
+    },
+    {
+      "epoch": 0.25376,
+      "grad_norm": 0.37536172724992684,
+      "learning_rate": 0.0001748827886267372,
+      "loss": 0.6629,
+      "step": 793
+    },
+    {
+      "epoch": 0.25408,
+      "grad_norm": 0.37939017436966077,
+      "learning_rate": 0.0001748140537234191,
+      "loss": 0.7054,
+      "step": 794
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4008613066398397,
+      "learning_rate": 0.00017474523844694518,
+      "loss": 0.6793,
+      "step": 795
+    },
+    {
+      "epoch": 0.25472,
+      "grad_norm": 0.37880333252362164,
+      "learning_rate": 0.00017467634287124414,
+      "loss": 0.7231,
+      "step": 796
+    },
+    {
+      "epoch": 0.25504,
+      "grad_norm": 0.362347756950756,
+      "learning_rate": 0.0001746073670703308,
+      "loss": 0.7087,
+      "step": 797
+    },
+    {
+      "epoch": 0.25536,
+      "grad_norm": 0.3660111047422492,
+      "learning_rate": 0.00017453831111830632,
+      "loss": 0.6523,
+      "step": 798
+    },
+    {
+      "epoch": 0.25568,
+      "grad_norm": 0.40853376479823095,
+      "learning_rate": 0.00017446917508935785,
+      "loss": 0.6982,
+      "step": 799
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3680173008859809,
+      "learning_rate": 0.0001743999590577586,
+      "loss": 0.7078,
+      "step": 800
+    },
+    {
+      "epoch": 0.25632,
+      "grad_norm": 0.38643790020373076,
+      "learning_rate": 0.00017433066309786779,
+      "loss": 0.6959,
+      "step": 801
+    },
+    {
+      "epoch": 0.25664,
+      "grad_norm": 0.39890479799005363,
+      "learning_rate": 0.0001742612872841304,
+      "loss": 0.7043,
+      "step": 802
+    },
+    {
+      "epoch": 0.25696,
+      "grad_norm": 0.3647697505304056,
+      "learning_rate": 0.00017419183169107728,
+      "loss": 0.6646,
+      "step": 803
+    },
+    {
+      "epoch": 0.25728,
+      "grad_norm": 0.38063519440723825,
+      "learning_rate": 0.00017412229639332497,
+      "loss": 0.6662,
+      "step": 804
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.3613607544656473,
+      "learning_rate": 0.00017405268146557565,
+      "loss": 0.6604,
+      "step": 805
+    },
+    {
+      "epoch": 0.25792,
+      "grad_norm": 0.3705790308535535,
+      "learning_rate": 0.00017398298698261696,
+      "loss": 0.6854,
+      "step": 806
+    },
+    {
+      "epoch": 0.25824,
+      "grad_norm": 0.369450868677443,
+      "learning_rate": 0.00017391321301932217,
+      "loss": 0.6787,
+      "step": 807
+    },
+    {
+      "epoch": 0.25856,
+      "grad_norm": 0.34498991261234885,
+      "learning_rate": 0.00017384335965064972,
+      "loss": 0.6087,
+      "step": 808
+    },
+    {
+      "epoch": 0.25888,
+      "grad_norm": 0.3544005279140098,
+      "learning_rate": 0.00017377342695164356,
+      "loss": 0.6523,
+      "step": 809
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3703786236785271,
+      "learning_rate": 0.00017370341499743278,
+      "loss": 0.6886,
+      "step": 810
+    },
+    {
+      "epoch": 0.25952,
+      "grad_norm": 0.3586889275901231,
+      "learning_rate": 0.00017363332386323156,
+      "loss": 0.6845,
+      "step": 811
+    },
+    {
+      "epoch": 0.25984,
+      "grad_norm": 0.35213584457379365,
+      "learning_rate": 0.0001735631536243392,
+      "loss": 0.6585,
+      "step": 812
+    },
+    {
+      "epoch": 0.26016,
+      "grad_norm": 0.4054390943539348,
+      "learning_rate": 0.00017349290435614,
+      "loss": 0.6519,
+      "step": 813
+    },
+    {
+      "epoch": 0.26048,
+      "grad_norm": 0.36669723823752626,
+      "learning_rate": 0.0001734225761341032,
+      "loss": 0.697,
+      "step": 814
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.3579772708248754,
+      "learning_rate": 0.00017335216903378267,
+      "loss": 0.7142,
+      "step": 815
+    },
+    {
+      "epoch": 0.26112,
+      "grad_norm": 0.37007009053306683,
+      "learning_rate": 0.00017328168313081728,
+      "loss": 0.6559,
+      "step": 816
+    },
+    {
+      "epoch": 0.26144,
+      "grad_norm": 0.36960486757346084,
+      "learning_rate": 0.00017321111850093036,
+      "loss": 0.67,
+      "step": 817
+    },
+    {
+      "epoch": 0.26176,
+      "grad_norm": 0.3979872534062356,
+      "learning_rate": 0.00017314047521992993,
+      "loss": 0.7315,
+      "step": 818
+    },
+    {
+      "epoch": 0.26208,
+      "grad_norm": 0.3629816736881855,
+      "learning_rate": 0.0001730697533637084,
+      "loss": 0.6971,
+      "step": 819
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.34987269536959664,
+      "learning_rate": 0.0001729989530082427,
+      "loss": 0.6862,
+      "step": 820
+    },
+    {
+      "epoch": 0.26272,
+      "grad_norm": 0.3506826922755894,
+      "learning_rate": 0.00017292807422959402,
+      "loss": 0.6874,
+      "step": 821
+    },
+    {
+      "epoch": 0.26304,
+      "grad_norm": 0.35080493419930325,
+      "learning_rate": 0.00017285711710390787,
+      "loss": 0.6482,
+      "step": 822
+    },
+    {
+      "epoch": 0.26336,
+      "grad_norm": 0.36295944138367087,
+      "learning_rate": 0.00017278608170741383,
+      "loss": 0.7002,
+      "step": 823
+    },
+    {
+      "epoch": 0.26368,
+      "grad_norm": 0.3819450634574015,
+      "learning_rate": 0.0001727149681164257,
+      "loss": 0.709,
+      "step": 824
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3736935737242355,
+      "learning_rate": 0.00017264377640734114,
+      "loss": 0.6962,
+      "step": 825
+    },
+    {
+      "epoch": 0.26432,
+      "grad_norm": 0.38204111246911654,
+      "learning_rate": 0.00017257250665664179,
+      "loss": 0.691,
+      "step": 826
+    },
+    {
+      "epoch": 0.26464,
+      "grad_norm": 0.3513189667088068,
+      "learning_rate": 0.00017250115894089322,
+      "loss": 0.6449,
+      "step": 827
+    },
+    {
+      "epoch": 0.26496,
+      "grad_norm": 0.3644897631112619,
+      "learning_rate": 0.0001724297333367446,
+      "loss": 0.6863,
+      "step": 828
+    },
+    {
+      "epoch": 0.26528,
+      "grad_norm": 0.36027334107647974,
+      "learning_rate": 0.00017235822992092893,
+      "loss": 0.6706,
+      "step": 829
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.35122715715410546,
+      "learning_rate": 0.00017228664877026265,
+      "loss": 0.673,
+      "step": 830
+    },
+    {
+      "epoch": 0.26592,
+      "grad_norm": 0.3625251310005681,
+      "learning_rate": 0.00017221498996164582,
+      "loss": 0.6612,
+      "step": 831
+    },
+    {
+      "epoch": 0.26624,
+      "grad_norm": 0.35030684269057144,
+      "learning_rate": 0.00017214325357206193,
+      "loss": 0.647,
+      "step": 832
+    },
+    {
+      "epoch": 0.26656,
+      "grad_norm": 0.3675875937136502,
+      "learning_rate": 0.00017207143967857777,
+      "loss": 0.7279,
+      "step": 833
+    },
+    {
+      "epoch": 0.26688,
+      "grad_norm": 0.3727898885861703,
+      "learning_rate": 0.00017199954835834337,
+      "loss": 0.7008,
+      "step": 834
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.3618226189281512,
+      "learning_rate": 0.00017192757968859202,
+      "loss": 0.7105,
+      "step": 835
+    },
+    {
+      "epoch": 0.26752,
+      "grad_norm": 0.394255699092009,
+      "learning_rate": 0.00017185553374664004,
+      "loss": 0.6932,
+      "step": 836
+    },
+    {
+      "epoch": 0.26784,
+      "grad_norm": 0.349870418299385,
+      "learning_rate": 0.00017178341060988678,
+      "loss": 0.7202,
+      "step": 837
+    },
+    {
+      "epoch": 0.26816,
+      "grad_norm": 0.3651099621762893,
+      "learning_rate": 0.0001717112103558146,
+      "loss": 0.7125,
+      "step": 838
+    },
+    {
+      "epoch": 0.26848,
+      "grad_norm": 0.36389602459689674,
+      "learning_rate": 0.00017163893306198854,
+      "loss": 0.7007,
+      "step": 839
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.36075341264175675,
+      "learning_rate": 0.00017156657880605653,
+      "loss": 0.6866,
+      "step": 840
+    },
+    {
+      "epoch": 0.26912,
+      "grad_norm": 0.3834109027245434,
+      "learning_rate": 0.00017149414766574918,
+      "loss": 0.6775,
+      "step": 841
+    },
+    {
+      "epoch": 0.26944,
+      "grad_norm": 0.3746030932157122,
+      "learning_rate": 0.00017142163971887965,
+      "loss": 0.6482,
+      "step": 842
+    },
+    {
+      "epoch": 0.26976,
+      "grad_norm": 0.356226388303307,
+      "learning_rate": 0.00017134905504334364,
+      "loss": 0.6423,
+      "step": 843
+    },
+    {
+      "epoch": 0.27008,
+      "grad_norm": 0.36012166278949104,
+      "learning_rate": 0.00017127639371711926,
+      "loss": 0.7024,
+      "step": 844
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.3654792778007879,
+      "learning_rate": 0.000171203655818267,
+      "loss": 0.7008,
+      "step": 845
+    },
+    {
+      "epoch": 0.27072,
+      "grad_norm": 0.37270068868360284,
+      "learning_rate": 0.0001711308414249295,
+      "loss": 0.7033,
+      "step": 846
+    },
+    {
+      "epoch": 0.27104,
+      "grad_norm": 0.34807063715927555,
+      "learning_rate": 0.00017105795061533183,
+      "loss": 0.6358,
+      "step": 847
+    },
+    {
+      "epoch": 0.27136,
+      "grad_norm": 0.393270880591959,
+      "learning_rate": 0.0001709849834677809,
+      "loss": 0.6791,
+      "step": 848
+    },
+    {
+      "epoch": 0.27168,
+      "grad_norm": 0.3655470453282021,
+      "learning_rate": 0.00017091194006066572,
+      "loss": 0.6456,
+      "step": 849
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3506513116310917,
+      "learning_rate": 0.0001708388204724572,
+      "loss": 0.682,
+      "step": 850
+    },
+    {
+      "epoch": 0.27232,
+      "grad_norm": 0.3851875616113911,
+      "learning_rate": 0.00017076562478170822,
+      "loss": 0.6612,
+      "step": 851
+    },
+    {
+      "epoch": 0.27264,
+      "grad_norm": 0.37361029451719346,
+      "learning_rate": 0.00017069235306705323,
+      "loss": 0.6621,
+      "step": 852
+    },
+    {
+      "epoch": 0.27296,
+      "grad_norm": 0.4018506569535499,
+      "learning_rate": 0.0001706190054072085,
+      "loss": 0.6689,
+      "step": 853
+    },
+    {
+      "epoch": 0.27328,
+      "grad_norm": 0.3574705065373127,
+      "learning_rate": 0.0001705455818809718,
+      "loss": 0.666,
+      "step": 854
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.37118008203529135,
+      "learning_rate": 0.00017047208256722244,
+      "loss": 0.6447,
+      "step": 855
+    },
+    {
+      "epoch": 0.27392,
+      "grad_norm": 0.37339442339511913,
+      "learning_rate": 0.00017039850754492112,
+      "loss": 0.6366,
+      "step": 856
+    },
+    {
+      "epoch": 0.27424,
+      "grad_norm": 0.36554648138335893,
+      "learning_rate": 0.00017032485689310998,
+      "loss": 0.6809,
+      "step": 857
+    },
+    {
+      "epoch": 0.27456,
+      "grad_norm": 0.3764822919216519,
+      "learning_rate": 0.00017025113069091223,
+      "loss": 0.6637,
+      "step": 858
+    },
+    {
+      "epoch": 0.27488,
+      "grad_norm": 0.4207942342818041,
+      "learning_rate": 0.0001701773290175324,
+      "loss": 0.7455,
+      "step": 859
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.34667589640154944,
+      "learning_rate": 0.00017010345195225598,
+      "loss": 0.6685,
+      "step": 860
+    },
+    {
+      "epoch": 0.27552,
+      "grad_norm": 0.3581586943653799,
+      "learning_rate": 0.0001700294995744496,
+      "loss": 0.663,
+      "step": 861
+    },
+    {
+      "epoch": 0.27584,
+      "grad_norm": 0.371326003370556,
+      "learning_rate": 0.00016995547196356066,
+      "loss": 0.6911,
+      "step": 862
+    },
+    {
+      "epoch": 0.27616,
+      "grad_norm": 0.36538386375152104,
+      "learning_rate": 0.0001698813691991174,
+      "loss": 0.6506,
+      "step": 863
+    },
+    {
+      "epoch": 0.27648,
+      "grad_norm": 0.36767576315495165,
+      "learning_rate": 0.00016980719136072892,
+      "loss": 0.7034,
+      "step": 864
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.37065161276198183,
+      "learning_rate": 0.00016973293852808486,
+      "loss": 0.6778,
+      "step": 865
+    },
+    {
+      "epoch": 0.27712,
+      "grad_norm": 0.38682531338216397,
+      "learning_rate": 0.00016965861078095537,
+      "loss": 0.6658,
+      "step": 866
+    },
+    {
+      "epoch": 0.27744,
+      "grad_norm": 0.3810259829824189,
+      "learning_rate": 0.00016958420819919128,
+      "loss": 0.6738,
+      "step": 867
+    },
+    {
+      "epoch": 0.27776,
+      "grad_norm": 0.3749059191738754,
+      "learning_rate": 0.00016950973086272365,
+      "loss": 0.7156,
+      "step": 868
+    },
+    {
+      "epoch": 0.27808,
+      "grad_norm": 0.3872215251605679,
+      "learning_rate": 0.00016943517885156386,
+      "loss": 0.6865,
+      "step": 869
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3738462248754198,
+      "learning_rate": 0.0001693605522458036,
+      "loss": 0.6918,
+      "step": 870
+    },
+    {
+      "epoch": 0.27872,
+      "grad_norm": 0.36877661490834046,
+      "learning_rate": 0.00016928585112561465,
+      "loss": 0.6679,
+      "step": 871
+    },
+    {
+      "epoch": 0.27904,
+      "grad_norm": 0.3544422874787804,
+      "learning_rate": 0.00016921107557124883,
+      "loss": 0.6272,
+      "step": 872
+    },
+    {
+      "epoch": 0.27936,
+      "grad_norm": 0.3708189893062282,
+      "learning_rate": 0.0001691362256630379,
+      "loss": 0.6806,
+      "step": 873
+    },
+    {
+      "epoch": 0.27968,
+      "grad_norm": 0.380215015229633,
+      "learning_rate": 0.00016906130148139364,
+      "loss": 0.7079,
+      "step": 874
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.37096240233370503,
+      "learning_rate": 0.00016898630310680738,
+      "loss": 0.6733,
+      "step": 875
+    },
+    {
+      "epoch": 0.28032,
+      "grad_norm": 0.3941364301418198,
+      "learning_rate": 0.0001689112306198504,
+      "loss": 0.6839,
+      "step": 876
+    },
+    {
+      "epoch": 0.28064,
+      "grad_norm": 0.36432588768029983,
+      "learning_rate": 0.00016883608410117343,
+      "loss": 0.6583,
+      "step": 877
+    },
+    {
+      "epoch": 0.28096,
+      "grad_norm": 0.36934987086358184,
+      "learning_rate": 0.0001687608636315068,
+      "loss": 0.6532,
+      "step": 878
+    },
+    {
+      "epoch": 0.28128,
+      "grad_norm": 0.3694658537509994,
+      "learning_rate": 0.00016868556929166032,
+      "loss": 0.6482,
+      "step": 879
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3713959384678602,
+      "learning_rate": 0.0001686102011625231,
+      "loss": 0.6642,
+      "step": 880
+    },
+    {
+      "epoch": 0.28192,
+      "grad_norm": 0.421555137852422,
+      "learning_rate": 0.00016853475932506352,
+      "loss": 0.6305,
+      "step": 881
+    },
+    {
+      "epoch": 0.28224,
+      "grad_norm": 0.3815269263698826,
+      "learning_rate": 0.00016845924386032918,
+      "loss": 0.674,
+      "step": 882
+    },
+    {
+      "epoch": 0.28256,
+      "grad_norm": 0.3671879656089705,
+      "learning_rate": 0.0001683836548494468,
+      "loss": 0.6754,
+      "step": 883
+    },
+    {
+      "epoch": 0.28288,
+      "grad_norm": 0.33850983586535077,
+      "learning_rate": 0.00016830799237362203,
+      "loss": 0.6268,
+      "step": 884
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.3521345262892056,
+      "learning_rate": 0.00016823225651413953,
+      "loss": 0.6193,
+      "step": 885
+    },
+    {
+      "epoch": 0.28352,
+      "grad_norm": 0.6904724303293136,
+      "learning_rate": 0.00016815644735236268,
+      "loss": 0.6971,
+      "step": 886
+    },
+    {
+      "epoch": 0.28384,
+      "grad_norm": 0.3898409987607257,
+      "learning_rate": 0.0001680805649697338,
+      "loss": 0.6596,
+      "step": 887
+    },
+    {
+      "epoch": 0.28416,
+      "grad_norm": 0.36430433774492677,
+      "learning_rate": 0.0001680046094477737,
+      "loss": 0.6383,
+      "step": 888
+    },
+    {
+      "epoch": 0.28448,
+      "grad_norm": 0.4237870824617201,
+      "learning_rate": 0.00016792858086808177,
+      "loss": 0.6757,
+      "step": 889
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3792898105886306,
+      "learning_rate": 0.00016785247931233602,
+      "loss": 0.672,
+      "step": 890
+    },
+    {
+      "epoch": 0.28512,
+      "grad_norm": 0.4005473857322007,
+      "learning_rate": 0.00016777630486229273,
+      "loss": 0.7246,
+      "step": 891
+    },
+    {
+      "epoch": 0.28544,
+      "grad_norm": 0.3784335381702001,
+      "learning_rate": 0.00016770005759978655,
+      "loss": 0.6801,
+      "step": 892
+    },
+    {
+      "epoch": 0.28576,
+      "grad_norm": 0.3661674946140045,
+      "learning_rate": 0.00016762373760673035,
+      "loss": 0.7091,
+      "step": 893
+    },
+    {
+      "epoch": 0.28608,
+      "grad_norm": 0.35777449313307796,
+      "learning_rate": 0.00016754734496511514,
+      "loss": 0.6189,
+      "step": 894
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.35590285851087305,
+      "learning_rate": 0.00016747087975700988,
+      "loss": 0.6969,
+      "step": 895
+    },
+    {
+      "epoch": 0.28672,
+      "grad_norm": 0.402997502242191,
+      "learning_rate": 0.00016739434206456167,
+      "loss": 0.6836,
+      "step": 896
+    },
+    {
+      "epoch": 0.28704,
+      "grad_norm": 0.3859134510340487,
+      "learning_rate": 0.00016731773196999533,
+      "loss": 0.7321,
+      "step": 897
+    },
+    {
+      "epoch": 0.28736,
+      "grad_norm": 0.35399634339455766,
+      "learning_rate": 0.00016724104955561354,
+      "loss": 0.64,
+      "step": 898
+    },
+    {
+      "epoch": 0.28768,
+      "grad_norm": 0.35460270204765015,
+      "learning_rate": 0.0001671642949037966,
+      "loss": 0.6796,
+      "step": 899
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3577821876725225,
+      "learning_rate": 0.0001670874680970025,
+      "loss": 0.6261,
+      "step": 900
+    },
+    {
+      "epoch": 0.28832,
+      "grad_norm": 0.3772616328756418,
+      "learning_rate": 0.0001670105692177667,
+      "loss": 0.6755,
+      "step": 901
+    },
+    {
+      "epoch": 0.28864,
+      "grad_norm": 0.380094153877933,
+      "learning_rate": 0.00016693359834870207,
+      "loss": 0.6484,
+      "step": 902
+    },
+    {
+      "epoch": 0.28896,
+      "grad_norm": 0.3665405454149139,
+      "learning_rate": 0.00016685655557249887,
+      "loss": 0.6946,
+      "step": 903
+    },
+    {
+      "epoch": 0.28928,
+      "grad_norm": 0.3861010972957807,
+      "learning_rate": 0.0001667794409719246,
+      "loss": 0.6929,
+      "step": 904
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3926124592299394,
+      "learning_rate": 0.00016670225462982386,
+      "loss": 0.6724,
+      "step": 905
+    },
+    {
+      "epoch": 0.28992,
+      "grad_norm": 0.3523672322317073,
+      "learning_rate": 0.0001666249966291184,
+      "loss": 0.6467,
+      "step": 906
+    },
+    {
+      "epoch": 0.29024,
+      "grad_norm": 0.3602395304708209,
+      "learning_rate": 0.00016654766705280694,
+      "loss": 0.6229,
+      "step": 907
+    },
+    {
+      "epoch": 0.29056,
+      "grad_norm": 0.3999067550893319,
+      "learning_rate": 0.00016647026598396505,
+      "loss": 0.6553,
+      "step": 908
+    },
+    {
+      "epoch": 0.29088,
+      "grad_norm": 0.3618976583977307,
+      "learning_rate": 0.0001663927935057451,
+      "loss": 0.6022,
+      "step": 909
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3643539203957328,
+      "learning_rate": 0.0001663152497013763,
+      "loss": 0.6918,
+      "step": 910
+    },
+    {
+      "epoch": 0.29152,
+      "grad_norm": 0.38437017782753297,
+      "learning_rate": 0.00016623763465416425,
+      "loss": 0.6706,
+      "step": 911
+    },
+    {
+      "epoch": 0.29184,
+      "grad_norm": 0.39873148741296155,
+      "learning_rate": 0.0001661599484474914,
+      "loss": 0.7074,
+      "step": 912
+    },
+    {
+      "epoch": 0.29216,
+      "grad_norm": 0.3949016272609718,
+      "learning_rate": 0.0001660821911648163,
+      "loss": 0.7474,
+      "step": 913
+    },
+    {
+      "epoch": 0.29248,
+      "grad_norm": 0.3494489891991421,
+      "learning_rate": 0.00016600436288967418,
+      "loss": 0.6632,
+      "step": 914
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.3501319837881801,
+      "learning_rate": 0.0001659264637056763,
+      "loss": 0.6304,
+      "step": 915
+    },
+    {
+      "epoch": 0.29312,
+      "grad_norm": 0.3527166694363339,
+      "learning_rate": 0.00016584849369651026,
+      "loss": 0.6397,
+      "step": 916
+    },
+    {
+      "epoch": 0.29344,
+      "grad_norm": 0.381420613678227,
+      "learning_rate": 0.00016577045294593958,
+      "loss": 0.6531,
+      "step": 917
+    },
+    {
+      "epoch": 0.29376,
+      "grad_norm": 0.36213841323932616,
+      "learning_rate": 0.00016569234153780395,
+      "loss": 0.6513,
+      "step": 918
+    },
+    {
+      "epoch": 0.29408,
+      "grad_norm": 0.3623787905356813,
+      "learning_rate": 0.00016561415955601886,
+      "loss": 0.6275,
+      "step": 919
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.35633437654990074,
+      "learning_rate": 0.0001655359070845757,
+      "loss": 0.6678,
+      "step": 920
+    },
+    {
+      "epoch": 0.29472,
+      "grad_norm": 0.3746090132247638,
+      "learning_rate": 0.00016545758420754146,
+      "loss": 0.6892,
+      "step": 921
+    },
+    {
+      "epoch": 0.29504,
+      "grad_norm": 0.3425791549577302,
+      "learning_rate": 0.0001653791910090589,
+      "loss": 0.5937,
+      "step": 922
+    },
+    {
+      "epoch": 0.29536,
+      "grad_norm": 0.36255434070489573,
+      "learning_rate": 0.00016530072757334625,
+      "loss": 0.6745,
+      "step": 923
+    },
+    {
+      "epoch": 0.29568,
+      "grad_norm": 0.3776061396002599,
+      "learning_rate": 0.00016522219398469723,
+      "loss": 0.6909,
+      "step": 924
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.38002185151870216,
+      "learning_rate": 0.00016514359032748088,
+      "loss": 0.6561,
+      "step": 925
+    },
+    {
+      "epoch": 0.29632,
+      "grad_norm": 0.37971148165006247,
+      "learning_rate": 0.0001650649166861416,
+      "loss": 0.7082,
+      "step": 926
+    },
+    {
+      "epoch": 0.29664,
+      "grad_norm": 0.36269364908054363,
+      "learning_rate": 0.00016498617314519886,
+      "loss": 0.6385,
+      "step": 927
+    },
+    {
+      "epoch": 0.29696,
+      "grad_norm": 0.3599052242365175,
+      "learning_rate": 0.00016490735978924733,
+      "loss": 0.6906,
+      "step": 928
+    },
+    {
+      "epoch": 0.29728,
+      "grad_norm": 0.351768734324277,
+      "learning_rate": 0.00016482847670295665,
+      "loss": 0.6703,
+      "step": 929
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3812060990620126,
+      "learning_rate": 0.00016474952397107134,
+      "loss": 0.6426,
+      "step": 930
+    },
+    {
+      "epoch": 0.29792,
+      "grad_norm": 0.35627487037143313,
+      "learning_rate": 0.00016467050167841074,
+      "loss": 0.6475,
+      "step": 931
+    },
+    {
+      "epoch": 0.29824,
+      "grad_norm": 0.3646915738130938,
+      "learning_rate": 0.00016459140990986894,
+      "loss": 0.7014,
+      "step": 932
+    },
+    {
+      "epoch": 0.29856,
+      "grad_norm": 0.3771235652915896,
+      "learning_rate": 0.0001645122487504147,
+      "loss": 0.7274,
+      "step": 933
+    },
+    {
+      "epoch": 0.29888,
+      "grad_norm": 0.353189475012315,
+      "learning_rate": 0.0001644330182850913,
+      "loss": 0.5989,
+      "step": 934
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.37873884947495684,
+      "learning_rate": 0.00016435371859901645,
+      "loss": 0.735,
+      "step": 935
+    },
+    {
+      "epoch": 0.29952,
+      "grad_norm": 0.3794924661892233,
+      "learning_rate": 0.00016427434977738225,
+      "loss": 0.7076,
+      "step": 936
+    },
+    {
+      "epoch": 0.29984,
+      "grad_norm": 0.35027737099186884,
+      "learning_rate": 0.00016419491190545509,
+      "loss": 0.6959,
+      "step": 937
+    },
+    {
+      "epoch": 0.30016,
+      "grad_norm": 0.33591022962403055,
+      "learning_rate": 0.00016411540506857547,
+      "loss": 0.6204,
+      "step": 938
+    },
+    {
+      "epoch": 0.30048,
+      "grad_norm": 0.3539505955772323,
+      "learning_rate": 0.0001640358293521581,
+      "loss": 0.656,
+      "step": 939
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3617125872942655,
+      "learning_rate": 0.0001639561848416915,
+      "loss": 0.6451,
+      "step": 940
+    },
+    {
+      "epoch": 0.30112,
+      "grad_norm": 0.37123614005159844,
+      "learning_rate": 0.00016387647162273837,
+      "loss": 0.7278,
+      "step": 941
+    },
+    {
+      "epoch": 0.30144,
+      "grad_norm": 0.3725088941668062,
+      "learning_rate": 0.00016379668978093491,
+      "loss": 0.6451,
+      "step": 942
+    },
+    {
+      "epoch": 0.30176,
+      "grad_norm": 0.34152340573341006,
+      "learning_rate": 0.00016371683940199133,
+      "loss": 0.6771,
+      "step": 943
+    },
+    {
+      "epoch": 0.30208,
+      "grad_norm": 0.3548490178109378,
+      "learning_rate": 0.00016363692057169124,
+      "loss": 0.6843,
+      "step": 944
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.35343157514639656,
+      "learning_rate": 0.00016355693337589196,
+      "loss": 0.688,
+      "step": 945
+    },
+    {
+      "epoch": 0.30272,
+      "grad_norm": 0.3640587759348646,
+      "learning_rate": 0.00016347687790052416,
+      "loss": 0.6717,
+      "step": 946
+    },
+    {
+      "epoch": 0.30304,
+      "grad_norm": 0.36144577101370634,
+      "learning_rate": 0.00016339675423159182,
+      "loss": 0.6268,
+      "step": 947
+    },
+    {
+      "epoch": 0.30336,
+      "grad_norm": 0.380090566499197,
+      "learning_rate": 0.0001633165624551723,
+      "loss": 0.6831,
+      "step": 948
+    },
+    {
+      "epoch": 0.30368,
+      "grad_norm": 0.3769643347765158,
+      "learning_rate": 0.0001632363026574161,
+      "loss": 0.6858,
+      "step": 949
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.37556314716044387,
+      "learning_rate": 0.00016315597492454672,
+      "loss": 0.6933,
+      "step": 950
+    },
+    {
+      "epoch": 0.30432,
+      "grad_norm": 0.3680280472287063,
+      "learning_rate": 0.0001630755793428607,
+      "loss": 0.6713,
+      "step": 951
+    },
+    {
+      "epoch": 0.30464,
+      "grad_norm": 0.36623019970443227,
+      "learning_rate": 0.00016299511599872753,
+      "loss": 0.7097,
+      "step": 952
+    },
+    {
+      "epoch": 0.30496,
+      "grad_norm": 0.3744260530128633,
+      "learning_rate": 0.0001629145849785893,
+      "loss": 0.6656,
+      "step": 953
+    },
+    {
+      "epoch": 0.30528,
+      "grad_norm": 0.3609342240323625,
+      "learning_rate": 0.00016283398636896107,
+      "loss": 0.6421,
+      "step": 954
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.35399280203812694,
+      "learning_rate": 0.00016275332025643028,
+      "loss": 0.6587,
+      "step": 955
+    },
+    {
+      "epoch": 0.30592,
+      "grad_norm": 0.3698257987725609,
+      "learning_rate": 0.000162672586727657,
+      "loss": 0.6756,
+      "step": 956
+    },
+    {
+      "epoch": 0.30624,
+      "grad_norm": 0.36418119602728144,
+      "learning_rate": 0.00016259178586937377,
+      "loss": 0.6538,
+      "step": 957
+    },
+    {
+      "epoch": 0.30656,
+      "grad_norm": 0.35738670930122973,
+      "learning_rate": 0.00016251091776838536,
+      "loss": 0.6733,
+      "step": 958
+    },
+    {
+      "epoch": 0.30688,
+      "grad_norm": 0.3883558384165783,
+      "learning_rate": 0.00016242998251156883,
+      "loss": 0.6671,
+      "step": 959
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.36084244676274785,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 0.6477,
+      "step": 960
+    },
+    {
+      "epoch": 0.30752,
+      "grad_norm": 0.35322104987763,
+      "learning_rate": 0.00016226791087832024,
+      "loss": 0.6534,
+      "step": 961
+    },
+    {
+      "epoch": 0.30784,
+      "grad_norm": 0.36973136564250303,
+      "learning_rate": 0.00016218677467600264,
+      "loss": 0.6583,
+      "step": 962
+    },
+    {
+      "epoch": 0.30816,
+      "grad_norm": 0.3824178268494372,
+      "learning_rate": 0.00016210557166608562,
+      "loss": 0.6576,
+      "step": 963
+    },
+    {
+      "epoch": 0.30848,
+      "grad_norm": 0.3411237488526102,
+      "learning_rate": 0.0001620243019358061,
+      "loss": 0.684,
+      "step": 964
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.37440464927829414,
+      "learning_rate": 0.00016194296557247255,
+      "loss": 0.652,
+      "step": 965
+    },
+    {
+      "epoch": 0.30912,
+      "grad_norm": 0.35787823750772685,
+      "learning_rate": 0.0001618615626634651,
+      "loss": 0.6943,
+      "step": 966
+    },
+    {
+      "epoch": 0.30944,
+      "grad_norm": 0.3579971211954716,
+      "learning_rate": 0.00016178009329623533,
+      "loss": 0.6607,
+      "step": 967
+    },
+    {
+      "epoch": 0.30976,
+      "grad_norm": 0.37491312284581807,
+      "learning_rate": 0.00016169855755830627,
+      "loss": 0.7079,
+      "step": 968
+    },
+    {
+      "epoch": 0.31008,
+      "grad_norm": 0.3758810732262012,
+      "learning_rate": 0.0001616169555372722,
+      "loss": 0.696,
+      "step": 969
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3685168908904295,
+      "learning_rate": 0.0001615352873207986,
+      "loss": 0.6437,
+      "step": 970
+    },
+    {
+      "epoch": 0.31072,
+      "grad_norm": 0.3472159295621595,
+      "learning_rate": 0.00016145355299662211,
+      "loss": 0.625,
+      "step": 971
+    },
+    {
+      "epoch": 0.31104,
+      "grad_norm": 0.3760137565048465,
+      "learning_rate": 0.0001613717526525504,
+      "loss": 0.663,
+      "step": 972
+    },
+    {
+      "epoch": 0.31136,
+      "grad_norm": 0.3705790528335316,
+      "learning_rate": 0.00016128988637646204,
+      "loss": 0.6884,
+      "step": 973
+    },
+    {
+      "epoch": 0.31168,
+      "grad_norm": 0.37541867430056264,
+      "learning_rate": 0.00016120795425630634,
+      "loss": 0.6854,
+      "step": 974
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.38349964714380863,
+      "learning_rate": 0.00016112595638010353,
+      "loss": 0.6917,
+      "step": 975
+    },
+    {
+      "epoch": 0.31232,
+      "grad_norm": 0.3640848867841372,
+      "learning_rate": 0.00016104389283594435,
+      "loss": 0.633,
+      "step": 976
+    },
+    {
+      "epoch": 0.31264,
+      "grad_norm": 0.36504894306850255,
+      "learning_rate": 0.00016096176371199015,
+      "loss": 0.6975,
+      "step": 977
+    },
+    {
+      "epoch": 0.31296,
+      "grad_norm": 0.3743788002825349,
+      "learning_rate": 0.00016087956909647264,
+      "loss": 0.6332,
+      "step": 978
+    },
+    {
+      "epoch": 0.31328,
+      "grad_norm": 0.38754508345126265,
+      "learning_rate": 0.000160797309077694,
+      "loss": 0.6625,
+      "step": 979
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3982264654764306,
+      "learning_rate": 0.00016071498374402665,
+      "loss": 0.735,
+      "step": 980
+    },
+    {
+      "epoch": 0.31392,
+      "grad_norm": 0.364892912346316,
+      "learning_rate": 0.00016063259318391314,
+      "loss": 0.6332,
+      "step": 981
+    },
+    {
+      "epoch": 0.31424,
+      "grad_norm": 0.3603837381919519,
+      "learning_rate": 0.00016055013748586606,
+      "loss": 0.6539,
+      "step": 982
+    },
+    {
+      "epoch": 0.31456,
+      "grad_norm": 0.40015024801271154,
+      "learning_rate": 0.0001604676167384681,
+      "loss": 0.6703,
+      "step": 983
+    },
+    {
+      "epoch": 0.31488,
+      "grad_norm": 0.35001951598663866,
+      "learning_rate": 0.0001603850310303717,
+      "loss": 0.6098,
+      "step": 984
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3668523738327645,
+      "learning_rate": 0.00016030238045029916,
+      "loss": 0.645,
+      "step": 985
+    },
+    {
+      "epoch": 0.31552,
+      "grad_norm": 0.35403577782505474,
+      "learning_rate": 0.00016021966508704253,
+      "loss": 0.6603,
+      "step": 986
+    },
+    {
+      "epoch": 0.31584,
+      "grad_norm": 0.3789129050894694,
+      "learning_rate": 0.0001601368850294633,
+      "loss": 0.6628,
+      "step": 987
+    },
+    {
+      "epoch": 0.31616,
+      "grad_norm": 0.4647907425339079,
+      "learning_rate": 0.00016005404036649256,
+      "loss": 0.6876,
+      "step": 988
+    },
+    {
+      "epoch": 0.31648,
+      "grad_norm": 0.3594507397418406,
+      "learning_rate": 0.00015997113118713086,
+      "loss": 0.5998,
+      "step": 989
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3652042297314683,
+      "learning_rate": 0.00015988815758044792,
+      "loss": 0.6838,
+      "step": 990
+    },
+    {
+      "epoch": 0.31712,
+      "grad_norm": 0.3729665870482187,
+      "learning_rate": 0.00015980511963558278,
+      "loss": 0.6495,
+      "step": 991
+    },
+    {
+      "epoch": 0.31744,
+      "grad_norm": 0.36933924818636055,
+      "learning_rate": 0.00015972201744174352,
+      "loss": 0.6757,
+      "step": 992
+    },
+    {
+      "epoch": 0.31776,
+      "grad_norm": 0.38679551637565257,
+      "learning_rate": 0.00015963885108820743,
+      "loss": 0.6541,
+      "step": 993
+    },
+    {
+      "epoch": 0.31808,
+      "grad_norm": 0.3603785868451744,
+      "learning_rate": 0.00015955562066432042,
+      "loss": 0.6163,
+      "step": 994
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.3929824324946899,
+      "learning_rate": 0.0001594723262594975,
+      "loss": 0.6849,
+      "step": 995
+    },
+    {
+      "epoch": 0.31872,
+      "grad_norm": 0.3559659322460444,
+      "learning_rate": 0.0001593889679632223,
+      "loss": 0.6379,
+      "step": 996
+    },
+    {
+      "epoch": 0.31904,
+      "grad_norm": 0.37883766953321385,
+      "learning_rate": 0.00015930554586504706,
+      "loss": 0.6729,
+      "step": 997
+    },
+    {
+      "epoch": 0.31936,
+      "grad_norm": 0.3729989821806419,
+      "learning_rate": 0.00015922206005459266,
+      "loss": 0.6889,
+      "step": 998
+    },
+    {
+      "epoch": 0.31968,
+      "grad_norm": 0.35918826467125653,
+      "learning_rate": 0.00015913851062154835,
+      "loss": 0.5814,
+      "step": 999
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.35906920939136566,
+      "learning_rate": 0.00015905489765567172,
+      "loss": 0.6608,
+      "step": 1000
+    },
+    {
+      "epoch": 0.32032,
+      "grad_norm": 0.383067845796615,
+      "learning_rate": 0.0001589712212467887,
+      "loss": 0.6538,
+      "step": 1001
+    },
+    {
+      "epoch": 0.32064,
+      "grad_norm": 0.39490816143678525,
+      "learning_rate": 0.00015888748148479328,
+      "loss": 0.7145,
+      "step": 1002
+    },
+    {
+      "epoch": 0.32096,
+      "grad_norm": 0.3649317674576256,
+      "learning_rate": 0.0001588036784596476,
+      "loss": 0.6571,
+      "step": 1003
+    },
+    {
+      "epoch": 0.32128,
+      "grad_norm": 0.3522141223196636,
+      "learning_rate": 0.00015871981226138173,
+      "loss": 0.6314,
+      "step": 1004
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.3737060606617921,
+      "learning_rate": 0.00015863588298009352,
+      "loss": 0.6448,
+      "step": 1005
+    },
+    {
+      "epoch": 0.32192,
+      "grad_norm": 0.4006255670960862,
+      "learning_rate": 0.00015855189070594866,
+      "loss": 0.6446,
+      "step": 1006
+    },
+    {
+      "epoch": 0.32224,
+      "grad_norm": 0.3788146630252767,
+      "learning_rate": 0.00015846783552918062,
+      "loss": 0.6669,
+      "step": 1007
+    },
+    {
+      "epoch": 0.32256,
+      "grad_norm": 0.3353138151804068,
+      "learning_rate": 0.00015838371754009028,
+      "loss": 0.6304,
+      "step": 1008
+    },
+    {
+      "epoch": 0.32288,
+      "grad_norm": 0.37891280362065266,
+      "learning_rate": 0.00015829953682904607,
+      "loss": 0.6662,
+      "step": 1009
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.36893994601189634,
+      "learning_rate": 0.0001582152934864838,
+      "loss": 0.6723,
+      "step": 1010
+    },
+    {
+      "epoch": 0.32352,
+      "grad_norm": 0.3718370779966313,
+      "learning_rate": 0.00015813098760290658,
+      "loss": 0.6795,
+      "step": 1011
+    },
+    {
+      "epoch": 0.32384,
+      "grad_norm": 0.37614222168808964,
+      "learning_rate": 0.00015804661926888466,
+      "loss": 0.689,
+      "step": 1012
+    },
+    {
+      "epoch": 0.32416,
+      "grad_norm": 0.35470906810610897,
+      "learning_rate": 0.00015796218857505546,
+      "loss": 0.6787,
+      "step": 1013
+    },
+    {
+      "epoch": 0.32448,
+      "grad_norm": 0.3868230064365295,
+      "learning_rate": 0.0001578776956121233,
+      "loss": 0.6919,
+      "step": 1014
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.3584080977191767,
+      "learning_rate": 0.00015779314047085946,
+      "loss": 0.649,
+      "step": 1015
+    },
+    {
+      "epoch": 0.32512,
+      "grad_norm": 0.3581072006980192,
+      "learning_rate": 0.00015770852324210202,
+      "loss": 0.6889,
+      "step": 1016
+    },
+    {
+      "epoch": 0.32544,
+      "grad_norm": 0.362762782023143,
+      "learning_rate": 0.00015762384401675567,
+      "loss": 0.6816,
+      "step": 1017
+    },
+    {
+      "epoch": 0.32576,
+      "grad_norm": 0.371193053002529,
+      "learning_rate": 0.00015753910288579184,
+      "loss": 0.6565,
+      "step": 1018
+    },
+    {
+      "epoch": 0.32608,
+      "grad_norm": 0.38115451431628883,
+      "learning_rate": 0.0001574542999402484,
+      "loss": 0.6811,
+      "step": 1019
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.36106395785399764,
+      "learning_rate": 0.00015736943527122963,
+      "loss": 0.661,
+      "step": 1020
+    },
+    {
+      "epoch": 0.32672,
+      "grad_norm": 0.35735851646054667,
+      "learning_rate": 0.00015728450896990606,
+      "loss": 0.6663,
+      "step": 1021
+    },
+    {
+      "epoch": 0.32704,
+      "grad_norm": 0.3623931389838278,
+      "learning_rate": 0.0001571995211275146,
+      "loss": 0.7253,
+      "step": 1022
+    },
+    {
+      "epoch": 0.32736,
+      "grad_norm": 0.35506019809511985,
+      "learning_rate": 0.00015711447183535806,
+      "loss": 0.618,
+      "step": 1023
+    },
+    {
+      "epoch": 0.32768,
+      "grad_norm": 0.3489857022479348,
+      "learning_rate": 0.0001570293611848054,
+      "loss": 0.6222,
+      "step": 1024
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.37680837981183957,
+      "learning_rate": 0.00015694418926729146,
+      "loss": 0.6799,
+      "step": 1025
+    },
+    {
+      "epoch": 0.32832,
+      "grad_norm": 0.36911157265370625,
+      "learning_rate": 0.00015685895617431695,
+      "loss": 0.6583,
+      "step": 1026
+    },
+    {
+      "epoch": 0.32864,
+      "grad_norm": 0.36159265607345803,
+      "learning_rate": 0.0001567736619974482,
+      "loss": 0.661,
+      "step": 1027
+    },
+    {
+      "epoch": 0.32896,
+      "grad_norm": 0.36901441587991685,
+      "learning_rate": 0.00015668830682831724,
+      "loss": 0.6655,
+      "step": 1028
+    },
+    {
+      "epoch": 0.32928,
+      "grad_norm": 0.36093552394155415,
+      "learning_rate": 0.00015660289075862164,
+      "loss": 0.5978,
+      "step": 1029
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.37342315971667994,
+      "learning_rate": 0.00015651741388012432,
+      "loss": 0.6677,
+      "step": 1030
+    },
+    {
+      "epoch": 0.32992,
+      "grad_norm": 0.4098215899518884,
+      "learning_rate": 0.0001564318762846536,
+      "loss": 0.6449,
+      "step": 1031
+    },
+    {
+      "epoch": 0.33024,
+      "grad_norm": 0.4050113949889007,
+      "learning_rate": 0.00015634627806410296,
+      "loss": 0.6834,
+      "step": 1032
+    },
+    {
+      "epoch": 0.33056,
+      "grad_norm": 0.3539258347240515,
+      "learning_rate": 0.00015626061931043106,
+      "loss": 0.6924,
+      "step": 1033
+    },
+    {
+      "epoch": 0.33088,
+      "grad_norm": 0.3806068426402795,
+      "learning_rate": 0.0001561749001156616,
+      "loss": 0.6653,
+      "step": 1034
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.36582037767946063,
+      "learning_rate": 0.00015608912057188317,
+      "loss": 0.639,
+      "step": 1035
+    },
+    {
+      "epoch": 0.33152,
+      "grad_norm": 0.370538404130026,
+      "learning_rate": 0.0001560032807712492,
+      "loss": 0.6589,
+      "step": 1036
+    },
+    {
+      "epoch": 0.33184,
+      "grad_norm": 0.366378492392802,
+      "learning_rate": 0.0001559173808059779,
+      "loss": 0.6549,
+      "step": 1037
+    },
+    {
+      "epoch": 0.33216,
+      "grad_norm": 0.3522169174480128,
+      "learning_rate": 0.00015583142076835204,
+      "loss": 0.6711,
+      "step": 1038
+    },
+    {
+      "epoch": 0.33248,
+      "grad_norm": 0.34943426259617355,
+      "learning_rate": 0.000155745400750719,
+      "loss": 0.6653,
+      "step": 1039
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.36946892742706056,
+      "learning_rate": 0.00015565932084549058,
+      "loss": 0.6565,
+      "step": 1040
+    },
+    {
+      "epoch": 0.33312,
+      "grad_norm": 0.3793852309877599,
+      "learning_rate": 0.00015557318114514285,
+      "loss": 0.675,
+      "step": 1041
+    },
+    {
+      "epoch": 0.33344,
+      "grad_norm": 0.36173698458850406,
+      "learning_rate": 0.00015548698174221626,
+      "loss": 0.6613,
+      "step": 1042
+    },
+    {
+      "epoch": 0.33376,
+      "grad_norm": 0.33469803714633256,
+      "learning_rate": 0.00015540072272931518,
+      "loss": 0.6195,
+      "step": 1043
+    },
+    {
+      "epoch": 0.33408,
+      "grad_norm": 0.36164971660089984,
+      "learning_rate": 0.00015531440419910828,
+      "loss": 0.6679,
+      "step": 1044
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3639937211916124,
+      "learning_rate": 0.00015522802624432796,
+      "loss": 0.6121,
+      "step": 1045
+    },
+    {
+      "epoch": 0.33472,
+      "grad_norm": 0.37109129267506613,
+      "learning_rate": 0.00015514158895777054,
+      "loss": 0.6926,
+      "step": 1046
+    },
+    {
+      "epoch": 0.33504,
+      "grad_norm": 0.3714485316690472,
+      "learning_rate": 0.00015505509243229614,
+      "loss": 0.6987,
+      "step": 1047
+    },
+    {
+      "epoch": 0.33536,
+      "grad_norm": 0.39142606845217615,
+      "learning_rate": 0.0001549685367608284,
+      "loss": 0.7089,
+      "step": 1048
+    },
+    {
+      "epoch": 0.33568,
+      "grad_norm": 0.36259030690561067,
+      "learning_rate": 0.0001548819220363546,
+      "loss": 0.6253,
+      "step": 1049
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.35952691110741264,
+      "learning_rate": 0.0001547952483519254,
+      "loss": 0.6555,
+      "step": 1050
+    },
+    {
+      "epoch": 0.33632,
+      "grad_norm": 0.3548597404926695,
+      "learning_rate": 0.0001547085158006548,
+      "loss": 0.6308,
+      "step": 1051
+    },
+    {
+      "epoch": 0.33664,
+      "grad_norm": 0.38402158450852814,
+      "learning_rate": 0.00015462172447572013,
+      "loss": 0.634,
+      "step": 1052
+    },
+    {
+      "epoch": 0.33696,
+      "grad_norm": 0.34441548175393877,
+      "learning_rate": 0.00015453487447036172,
+      "loss": 0.6589,
+      "step": 1053
+    },
+    {
+      "epoch": 0.33728,
+      "grad_norm": 0.344323551106791,
+      "learning_rate": 0.00015444796587788307,
+      "loss": 0.6249,
+      "step": 1054
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.38643166482499186,
+      "learning_rate": 0.00015436099879165055,
+      "loss": 0.7023,
+      "step": 1055
+    },
+    {
+      "epoch": 0.33792,
+      "grad_norm": 0.3563266249988576,
+      "learning_rate": 0.0001542739733050934,
+      "loss": 0.6786,
+      "step": 1056
+    },
+    {
+      "epoch": 0.33824,
+      "grad_norm": 0.35397941503812475,
+      "learning_rate": 0.00015418688951170356,
+      "loss": 0.6575,
+      "step": 1057
+    },
+    {
+      "epoch": 0.33856,
+      "grad_norm": 0.3790624743469411,
+      "learning_rate": 0.00015409974750503564,
+      "loss": 0.7061,
+      "step": 1058
+    },
+    {
+      "epoch": 0.33888,
+      "grad_norm": 0.3574400160617647,
+      "learning_rate": 0.00015401254737870682,
+      "loss": 0.6708,
+      "step": 1059
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.36844606935547664,
+      "learning_rate": 0.00015392528922639662,
+      "loss": 0.6051,
+      "step": 1060
+    },
+    {
+      "epoch": 0.33952,
+      "grad_norm": 0.37658332480738516,
+      "learning_rate": 0.00015383797314184704,
+      "loss": 0.6514,
+      "step": 1061
+    },
+    {
+      "epoch": 0.33984,
+      "grad_norm": 0.38675895372917346,
+      "learning_rate": 0.00015375059921886213,
+      "loss": 0.6716,
+      "step": 1062
+    },
+    {
+      "epoch": 0.34016,
+      "grad_norm": 0.3421617167538485,
+      "learning_rate": 0.00015366316755130829,
+      "loss": 0.6229,
+      "step": 1063
+    },
+    {
+      "epoch": 0.34048,
+      "grad_norm": 0.35342929523103755,
+      "learning_rate": 0.0001535756782331138,
+      "loss": 0.6561,
+      "step": 1064
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.36779895789196637,
+      "learning_rate": 0.00015348813135826893,
+      "loss": 0.675,
+      "step": 1065
+    },
+    {
+      "epoch": 0.34112,
+      "grad_norm": 0.361010410788992,
+      "learning_rate": 0.00015340052702082576,
+      "loss": 0.6069,
+      "step": 1066
+    },
+    {
+      "epoch": 0.34144,
+      "grad_norm": 0.34595375035287296,
+      "learning_rate": 0.00015331286531489817,
+      "loss": 0.6734,
+      "step": 1067
+    },
+    {
+      "epoch": 0.34176,
+      "grad_norm": 0.3483443428908466,
+      "learning_rate": 0.00015322514633466154,
+      "loss": 0.642,
+      "step": 1068
+    },
+    {
+      "epoch": 0.34208,
+      "grad_norm": 0.3955374802065069,
+      "learning_rate": 0.00015313737017435294,
+      "loss": 0.7165,
+      "step": 1069
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.37830914007613364,
+      "learning_rate": 0.00015304953692827074,
+      "loss": 0.6725,
+      "step": 1070
+    },
+    {
+      "epoch": 0.34272,
+      "grad_norm": 0.36273349137274824,
+      "learning_rate": 0.0001529616466907747,
+      "loss": 0.6709,
+      "step": 1071
+    },
+    {
+      "epoch": 0.34304,
+      "grad_norm": 0.34624603419985345,
+      "learning_rate": 0.0001528736995562858,
+      "loss": 0.6273,
+      "step": 1072
+    },
+    {
+      "epoch": 0.34336,
+      "grad_norm": 0.3648264327047193,
+      "learning_rate": 0.00015278569561928614,
+      "loss": 0.6467,
+      "step": 1073
+    },
+    {
+      "epoch": 0.34368,
+      "grad_norm": 0.37493698819062815,
+      "learning_rate": 0.00015269763497431882,
+      "loss": 0.6715,
+      "step": 1074
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3777423462146128,
+      "learning_rate": 0.0001526095177159879,
+      "loss": 0.6988,
+      "step": 1075
+    },
+    {
+      "epoch": 0.34432,
+      "grad_norm": 0.34828002107038014,
+      "learning_rate": 0.00015252134393895826,
+      "loss": 0.6032,
+      "step": 1076
+    },
+    {
+      "epoch": 0.34464,
+      "grad_norm": 0.3711510117528022,
+      "learning_rate": 0.0001524331137379554,
+      "loss": 0.6086,
+      "step": 1077
+    },
+    {
+      "epoch": 0.34496,
+      "grad_norm": 0.36452581357905783,
+      "learning_rate": 0.00015234482720776564,
+      "loss": 0.6448,
+      "step": 1078
+    },
+    {
+      "epoch": 0.34528,
+      "grad_norm": 0.3980780639622504,
+      "learning_rate": 0.0001522564844432356,
+      "loss": 0.6608,
+      "step": 1079
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.40193983367291936,
+      "learning_rate": 0.00015216808553927247,
+      "loss": 0.6543,
+      "step": 1080
+    },
+    {
+      "epoch": 0.34592,
+      "grad_norm": 0.3447463363113517,
+      "learning_rate": 0.00015207963059084357,
+      "loss": 0.6593,
+      "step": 1081
+    },
+    {
+      "epoch": 0.34624,
+      "grad_norm": 0.35587151155889213,
+      "learning_rate": 0.00015199111969297672,
+      "loss": 0.6565,
+      "step": 1082
+    },
+    {
+      "epoch": 0.34656,
+      "grad_norm": 0.3657730208797738,
+      "learning_rate": 0.00015190255294075951,
+      "loss": 0.642,
+      "step": 1083
+    },
+    {
+      "epoch": 0.34688,
+      "grad_norm": 0.3761834500421161,
+      "learning_rate": 0.00015181393042933982,
+      "loss": 0.7161,
+      "step": 1084
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.368373353833742,
+      "learning_rate": 0.00015172525225392526,
+      "loss": 0.6403,
+      "step": 1085
+    },
+    {
+      "epoch": 0.34752,
+      "grad_norm": 0.36157521739080134,
+      "learning_rate": 0.00015163651850978323,
+      "loss": 0.6209,
+      "step": 1086
+    },
+    {
+      "epoch": 0.34784,
+      "grad_norm": 0.37804306769636536,
+      "learning_rate": 0.00015154772929224097,
+      "loss": 0.6342,
+      "step": 1087
+    },
+    {
+      "epoch": 0.34816,
+      "grad_norm": 0.3659468089331613,
+      "learning_rate": 0.0001514588846966852,
+      "loss": 0.6017,
+      "step": 1088
+    },
+    {
+      "epoch": 0.34848,
+      "grad_norm": 0.4005300627479996,
+      "learning_rate": 0.00015136998481856217,
+      "loss": 0.6926,
+      "step": 1089
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.36799319928078683,
+      "learning_rate": 0.00015128102975337751,
+      "loss": 0.6702,
+      "step": 1090
+    },
+    {
+      "epoch": 0.34912,
+      "grad_norm": 0.4246392386445891,
+      "learning_rate": 0.00015119201959669617,
+      "loss": 0.6892,
+      "step": 1091
+    },
+    {
+      "epoch": 0.34944,
+      "grad_norm": 0.3623757114099262,
+      "learning_rate": 0.00015110295444414223,
+      "loss": 0.6689,
+      "step": 1092
+    },
+    {
+      "epoch": 0.34976,
+      "grad_norm": 0.3893692892556752,
+      "learning_rate": 0.00015101383439139885,
+      "loss": 0.6879,
+      "step": 1093
+    },
+    {
+      "epoch": 0.35008,
+      "grad_norm": 0.33343620870386564,
+      "learning_rate": 0.00015092465953420826,
+      "loss": 0.6139,
+      "step": 1094
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3431220788079396,
+      "learning_rate": 0.0001508354299683715,
+      "loss": 0.6167,
+      "step": 1095
+    },
+    {
+      "epoch": 0.35072,
+      "grad_norm": 0.3460788286937098,
+      "learning_rate": 0.00015074614578974838,
+      "loss": 0.6743,
+      "step": 1096
+    },
+    {
+      "epoch": 0.35104,
+      "grad_norm": 0.35435269334017727,
+      "learning_rate": 0.00015065680709425736,
+      "loss": 0.6402,
+      "step": 1097
+    },
+    {
+      "epoch": 0.35136,
+      "grad_norm": 0.3955384008580851,
+      "learning_rate": 0.00015056741397787552,
+      "loss": 0.7266,
+      "step": 1098
+    },
+    {
+      "epoch": 0.35168,
+      "grad_norm": 0.3720773510787015,
+      "learning_rate": 0.00015047796653663842,
+      "loss": 0.7068,
+      "step": 1099
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.36846361069983113,
+      "learning_rate": 0.00015038846486663992,
+      "loss": 0.6619,
+      "step": 1100
+    },
+    {
+      "epoch": 0.35232,
+      "grad_norm": 0.3731832828827827,
+      "learning_rate": 0.00015029890906403216,
+      "loss": 0.655,
+      "step": 1101
+    },
+    {
+      "epoch": 0.35264,
+      "grad_norm": 0.36390205898133987,
+      "learning_rate": 0.00015020929922502542,
+      "loss": 0.6291,
+      "step": 1102
+    },
+    {
+      "epoch": 0.35296,
+      "grad_norm": 0.37820250573305875,
+      "learning_rate": 0.00015011963544588806,
+      "loss": 0.653,
+      "step": 1103
+    },
+    {
+      "epoch": 0.35328,
+      "grad_norm": 0.37319611633025596,
+      "learning_rate": 0.00015002991782294643,
+      "loss": 0.6602,
+      "step": 1104
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.38281627790075523,
+      "learning_rate": 0.00014994014645258462,
+      "loss": 0.7075,
+      "step": 1105
+    },
+    {
+      "epoch": 0.35392,
+      "grad_norm": 0.36365256193161577,
+      "learning_rate": 0.0001498503214312445,
+      "loss": 0.679,
+      "step": 1106
+    },
+    {
+      "epoch": 0.35424,
+      "grad_norm": 0.38319966869436156,
+      "learning_rate": 0.00014976044285542562,
+      "loss": 0.6504,
+      "step": 1107
+    },
+    {
+      "epoch": 0.35456,
+      "grad_norm": 0.3902505783573718,
+      "learning_rate": 0.00014967051082168505,
+      "loss": 0.658,
+      "step": 1108
+    },
+    {
+      "epoch": 0.35488,
+      "grad_norm": 0.38244811334158846,
+      "learning_rate": 0.00014958052542663727,
+      "loss": 0.6883,
+      "step": 1109
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.36380406935396126,
+      "learning_rate": 0.0001494904867669541,
+      "loss": 0.635,
+      "step": 1110
+    },
+    {
+      "epoch": 0.35552,
+      "grad_norm": 0.36506113091519193,
+      "learning_rate": 0.00014940039493936452,
+      "loss": 0.6451,
+      "step": 1111
+    },
+    {
+      "epoch": 0.35584,
+      "grad_norm": 0.36786582084111225,
+      "learning_rate": 0.00014931025004065476,
+      "loss": 0.6817,
+      "step": 1112
+    },
+    {
+      "epoch": 0.35616,
+      "grad_norm": 0.3352380674175264,
+      "learning_rate": 0.00014922005216766793,
+      "loss": 0.6073,
+      "step": 1113
+    },
+    {
+      "epoch": 0.35648,
+      "grad_norm": 0.3740820351794441,
+      "learning_rate": 0.00014912980141730412,
+      "loss": 0.667,
+      "step": 1114
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3451447785259785,
+      "learning_rate": 0.00014903949788652024,
+      "loss": 0.6027,
+      "step": 1115
+    },
+    {
+      "epoch": 0.35712,
+      "grad_norm": 0.33993376634128447,
+      "learning_rate": 0.00014894914167232987,
+      "loss": 0.6378,
+      "step": 1116
+    },
+    {
+      "epoch": 0.35744,
+      "grad_norm": 0.36844012858525965,
+      "learning_rate": 0.00014885873287180318,
+      "loss": 0.6775,
+      "step": 1117
+    },
+    {
+      "epoch": 0.35776,
+      "grad_norm": 0.36975490164036573,
+      "learning_rate": 0.00014876827158206686,
+      "loss": 0.689,
+      "step": 1118
+    },
+    {
+      "epoch": 0.35808,
+      "grad_norm": 0.36327834327863406,
+      "learning_rate": 0.00014867775790030398,
+      "loss": 0.6936,
+      "step": 1119
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.33425609426282454,
+      "learning_rate": 0.00014858719192375387,
+      "loss": 0.6049,
+      "step": 1120
+    },
+    {
+      "epoch": 0.35872,
+      "grad_norm": 0.3655351336389169,
+      "learning_rate": 0.0001484965737497121,
+      "loss": 0.6658,
+      "step": 1121
+    },
+    {
+      "epoch": 0.35904,
+      "grad_norm": 0.3442095695084306,
+      "learning_rate": 0.00014840590347553028,
+      "loss": 0.6263,
+      "step": 1122
+    },
+    {
+      "epoch": 0.35936,
+      "grad_norm": 0.3464205548065845,
+      "learning_rate": 0.00014831518119861597,
+      "loss": 0.6626,
+      "step": 1123
+    },
+    {
+      "epoch": 0.35968,
+      "grad_norm": 0.40890607498456205,
+      "learning_rate": 0.0001482244070164326,
+      "loss": 0.7037,
+      "step": 1124
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3576923789465385,
+      "learning_rate": 0.00014813358102649943,
+      "loss": 0.601,
+      "step": 1125
+    },
+    {
+      "epoch": 0.36032,
+      "grad_norm": 0.3623456455719011,
+      "learning_rate": 0.00014804270332639133,
+      "loss": 0.7096,
+      "step": 1126
+    },
+    {
+      "epoch": 0.36064,
+      "grad_norm": 0.3543078306536569,
+      "learning_rate": 0.00014795177401373866,
+      "loss": 0.6231,
+      "step": 1127
+    },
+    {
+      "epoch": 0.36096,
+      "grad_norm": 0.37705917299442426,
+      "learning_rate": 0.00014786079318622732,
+      "loss": 0.703,
+      "step": 1128
+    },
+    {
+      "epoch": 0.36128,
+      "grad_norm": 0.36222471159585806,
+      "learning_rate": 0.00014776976094159854,
+      "loss": 0.6418,
+      "step": 1129
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3633471700280255,
+      "learning_rate": 0.00014767867737764876,
+      "loss": 0.6537,
+      "step": 1130
+    },
+    {
+      "epoch": 0.36192,
+      "grad_norm": 0.362633231449119,
+      "learning_rate": 0.00014758754259222955,
+      "loss": 0.6712,
+      "step": 1131
+    },
+    {
+      "epoch": 0.36224,
+      "grad_norm": 0.34320157126623524,
+      "learning_rate": 0.00014749635668324755,
+      "loss": 0.6682,
+      "step": 1132
+    },
+    {
+      "epoch": 0.36256,
+      "grad_norm": 0.3666364646050061,
+      "learning_rate": 0.00014740511974866425,
+      "loss": 0.7158,
+      "step": 1133
+    },
+    {
+      "epoch": 0.36288,
+      "grad_norm": 0.3670979884963596,
+      "learning_rate": 0.00014731383188649596,
+      "loss": 0.6555,
+      "step": 1134
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.35817734403737456,
+      "learning_rate": 0.00014722249319481384,
+      "loss": 0.6652,
+      "step": 1135
+    },
+    {
+      "epoch": 0.36352,
+      "grad_norm": 0.3839821900635269,
+      "learning_rate": 0.00014713110377174356,
+      "loss": 0.6639,
+      "step": 1136
+    },
+    {
+      "epoch": 0.36384,
+      "grad_norm": 0.3388437054909136,
+      "learning_rate": 0.00014703966371546517,
+      "loss": 0.6246,
+      "step": 1137
+    },
+    {
+      "epoch": 0.36416,
+      "grad_norm": 0.35378792334740317,
+      "learning_rate": 0.0001469481731242133,
+      "loss": 0.6469,
+      "step": 1138
+    },
+    {
+      "epoch": 0.36448,
+      "grad_norm": 0.3755234393310129,
+      "learning_rate": 0.00014685663209627688,
+      "loss": 0.6616,
+      "step": 1139
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3663473507265074,
+      "learning_rate": 0.0001467650407299988,
+      "loss": 0.6296,
+      "step": 1140
+    },
+    {
+      "epoch": 0.36512,
+      "grad_norm": 0.3489172688394357,
+      "learning_rate": 0.0001466733991237763,
+      "loss": 0.6181,
+      "step": 1141
+    },
+    {
+      "epoch": 0.36544,
+      "grad_norm": 0.34005200209869935,
+      "learning_rate": 0.00014658170737606038,
+      "loss": 0.5694,
+      "step": 1142
+    },
+    {
+      "epoch": 0.36576,
+      "grad_norm": 0.3480946148868714,
+      "learning_rate": 0.00014648996558535606,
+      "loss": 0.6509,
+      "step": 1143
+    },
+    {
+      "epoch": 0.36608,
+      "grad_norm": 0.38332042967451024,
+      "learning_rate": 0.00014639817385022206,
+      "loss": 0.6612,
+      "step": 1144
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3764488223683972,
+      "learning_rate": 0.00014630633226927068,
+      "loss": 0.6569,
+      "step": 1145
+    },
+    {
+      "epoch": 0.36672,
+      "grad_norm": 0.35554693281053207,
+      "learning_rate": 0.00014621444094116792,
+      "loss": 0.6779,
+      "step": 1146
+    },
+    {
+      "epoch": 0.36704,
+      "grad_norm": 0.35042745424800764,
+      "learning_rate": 0.0001461224999646331,
+      "loss": 0.6335,
+      "step": 1147
+    },
+    {
+      "epoch": 0.36736,
+      "grad_norm": 0.37137182264251023,
+      "learning_rate": 0.00014603050943843898,
+      "loss": 0.6843,
+      "step": 1148
+    },
+    {
+      "epoch": 0.36768,
+      "grad_norm": 0.3805604460905857,
+      "learning_rate": 0.00014593846946141142,
+      "loss": 0.6908,
+      "step": 1149
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.35950530577255946,
+      "learning_rate": 0.00014584638013242953,
+      "loss": 0.6292,
+      "step": 1150
+    },
+    {
+      "epoch": 0.36832,
+      "grad_norm": 0.3480550930098258,
+      "learning_rate": 0.00014575424155042536,
+      "loss": 0.6137,
+      "step": 1151
+    },
+    {
+      "epoch": 0.36864,
+      "grad_norm": 0.351338579024904,
+      "learning_rate": 0.00014566205381438395,
+      "loss": 0.6323,
+      "step": 1152
+    },
+    {
+      "epoch": 0.36896,
+      "grad_norm": 0.38514115687406036,
+      "learning_rate": 0.0001455698170233431,
+      "loss": 0.6585,
+      "step": 1153
+    },
+    {
+      "epoch": 0.36928,
+      "grad_norm": 0.38456616315153086,
+      "learning_rate": 0.00014547753127639324,
+      "loss": 0.6892,
+      "step": 1154
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.36196125122106154,
+      "learning_rate": 0.00014538519667267754,
+      "loss": 0.6207,
+      "step": 1155
+    },
+    {
+      "epoch": 0.36992,
+      "grad_norm": 0.34800232564616895,
+      "learning_rate": 0.00014529281331139153,
+      "loss": 0.6546,
+      "step": 1156
+    },
+    {
+      "epoch": 0.37024,
+      "grad_norm": 0.36055734006233836,
+      "learning_rate": 0.0001452003812917832,
+      "loss": 0.6532,
+      "step": 1157
+    },
+    {
+      "epoch": 0.37056,
+      "grad_norm": 0.4048817129042739,
+      "learning_rate": 0.00014510790071315278,
+      "loss": 0.6248,
+      "step": 1158
+    },
+    {
+      "epoch": 0.37088,
+      "grad_norm": 0.3607998948655766,
+      "learning_rate": 0.00014501537167485267,
+      "loss": 0.6351,
+      "step": 1159
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3508315751082414,
+      "learning_rate": 0.0001449227942762873,
+      "loss": 0.6193,
+      "step": 1160
+    },
+    {
+      "epoch": 0.37152,
+      "grad_norm": 0.4014065812136743,
+      "learning_rate": 0.0001448301686169131,
+      "loss": 0.6222,
+      "step": 1161
+    },
+    {
+      "epoch": 0.37184,
+      "grad_norm": 0.3464640299726335,
+      "learning_rate": 0.0001447374947962384,
+      "loss": 0.6205,
+      "step": 1162
+    },
+    {
+      "epoch": 0.37216,
+      "grad_norm": 0.36150707547216376,
+      "learning_rate": 0.00014464477291382315,
+      "loss": 0.6773,
+      "step": 1163
+    },
+    {
+      "epoch": 0.37248,
+      "grad_norm": 0.37605695631058256,
+      "learning_rate": 0.00014455200306927893,
+      "loss": 0.6612,
+      "step": 1164
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3605930599009112,
+      "learning_rate": 0.00014445918536226904,
+      "loss": 0.6628,
+      "step": 1165
+    },
+    {
+      "epoch": 0.37312,
+      "grad_norm": 0.37657130051925736,
+      "learning_rate": 0.00014436631989250793,
+      "loss": 0.622,
+      "step": 1166
+    },
+    {
+      "epoch": 0.37344,
+      "grad_norm": 0.3360705081746471,
+      "learning_rate": 0.00014427340675976158,
+      "loss": 0.6472,
+      "step": 1167
+    },
+    {
+      "epoch": 0.37376,
+      "grad_norm": 0.3692092099637814,
+      "learning_rate": 0.000144180446063847,
+      "loss": 0.643,
+      "step": 1168
+    },
+    {
+      "epoch": 0.37408,
+      "grad_norm": 0.3530391333825638,
+      "learning_rate": 0.00014408743790463247,
+      "loss": 0.6797,
+      "step": 1169
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.36097002707572645,
+      "learning_rate": 0.00014399438238203716,
+      "loss": 0.6374,
+      "step": 1170
+    },
+    {
+      "epoch": 0.37472,
+      "grad_norm": 0.35146178271190726,
+      "learning_rate": 0.00014390127959603108,
+      "loss": 0.6303,
+      "step": 1171
+    },
+    {
+      "epoch": 0.37504,
+      "grad_norm": 0.37091439055456177,
+      "learning_rate": 0.00014380812964663513,
+      "loss": 0.7094,
+      "step": 1172
+    },
+    {
+      "epoch": 0.37536,
+      "grad_norm": 0.36032936147731837,
+      "learning_rate": 0.0001437149326339208,
+      "loss": 0.7257,
+      "step": 1173
+    },
+    {
+      "epoch": 0.37568,
+      "grad_norm": 0.36897899886159996,
+      "learning_rate": 0.00014362168865801017,
+      "loss": 0.696,
+      "step": 1174
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3660038800899447,
+      "learning_rate": 0.00014352839781907578,
+      "loss": 0.6875,
+      "step": 1175
+    },
+    {
+      "epoch": 0.37632,
+      "grad_norm": 0.3358799316426415,
+      "learning_rate": 0.00014343506021734044,
+      "loss": 0.6353,
+      "step": 1176
+    },
+    {
+      "epoch": 0.37664,
+      "grad_norm": 0.3453952039192468,
+      "learning_rate": 0.00014334167595307732,
+      "loss": 0.6392,
+      "step": 1177
+    },
+    {
+      "epoch": 0.37696,
+      "grad_norm": 0.3493594546931393,
+      "learning_rate": 0.0001432482451266096,
+      "loss": 0.7301,
+      "step": 1178
+    },
+    {
+      "epoch": 0.37728,
+      "grad_norm": 0.35460348035497186,
+      "learning_rate": 0.0001431547678383106,
+      "loss": 0.6647,
+      "step": 1179
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.35492765885097327,
+      "learning_rate": 0.00014306124418860347,
+      "loss": 0.6736,
+      "step": 1180
+    },
+    {
+      "epoch": 0.37792,
+      "grad_norm": 0.34242203428065143,
+      "learning_rate": 0.00014296767427796116,
+      "loss": 0.6222,
+      "step": 1181
+    },
+    {
+      "epoch": 0.37824,
+      "grad_norm": 0.3733630052080278,
+      "learning_rate": 0.00014287405820690636,
+      "loss": 0.659,
+      "step": 1182
+    },
+    {
+      "epoch": 0.37856,
+      "grad_norm": 0.36465866936407954,
+      "learning_rate": 0.00014278039607601136,
+      "loss": 0.6454,
+      "step": 1183
+    },
+    {
+      "epoch": 0.37888,
+      "grad_norm": 0.35384191111172697,
+      "learning_rate": 0.00014268668798589793,
+      "loss": 0.6069,
+      "step": 1184
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3512339262141367,
+      "learning_rate": 0.00014259293403723716,
+      "loss": 0.6114,
+      "step": 1185
+    },
+    {
+      "epoch": 0.37952,
+      "grad_norm": 0.34354806060619186,
+      "learning_rate": 0.0001424991343307494,
+      "loss": 0.6582,
+      "step": 1186
+    },
+    {
+      "epoch": 0.37984,
+      "grad_norm": 0.391948641056348,
+      "learning_rate": 0.0001424052889672043,
+      "loss": 0.6364,
+      "step": 1187
+    },
+    {
+      "epoch": 0.38016,
+      "grad_norm": 0.3587244271259095,
+      "learning_rate": 0.00014231139804742036,
+      "loss": 0.6301,
+      "step": 1188
+    },
+    {
+      "epoch": 0.38048,
+      "grad_norm": 0.369912259095861,
+      "learning_rate": 0.00014221746167226518,
+      "loss": 0.6767,
+      "step": 1189
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.36033411888946815,
+      "learning_rate": 0.00014212347994265508,
+      "loss": 0.6407,
+      "step": 1190
+    },
+    {
+      "epoch": 0.38112,
+      "grad_norm": 0.35168942156212024,
+      "learning_rate": 0.0001420294529595552,
+      "loss": 0.6869,
+      "step": 1191
+    },
+    {
+      "epoch": 0.38144,
+      "grad_norm": 0.366334501959747,
+      "learning_rate": 0.00014193538082397927,
+      "loss": 0.6365,
+      "step": 1192
+    },
+    {
+      "epoch": 0.38176,
+      "grad_norm": 0.3551506950560667,
+      "learning_rate": 0.0001418412636369895,
+      "loss": 0.6427,
+      "step": 1193
+    },
+    {
+      "epoch": 0.38208,
+      "grad_norm": 0.349252047418438,
+      "learning_rate": 0.00014174710149969646,
+      "loss": 0.6261,
+      "step": 1194
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.344027119598442,
+      "learning_rate": 0.00014165289451325907,
+      "loss": 0.6468,
+      "step": 1195
+    },
+    {
+      "epoch": 0.38272,
+      "grad_norm": 0.3517115187781731,
+      "learning_rate": 0.0001415586427788845,
+      "loss": 0.6103,
+      "step": 1196
+    },
+    {
+      "epoch": 0.38304,
+      "grad_norm": 0.3538346711167031,
+      "learning_rate": 0.00014146434639782782,
+      "loss": 0.6111,
+      "step": 1197
+    },
+    {
+      "epoch": 0.38336,
+      "grad_norm": 0.3643372781369026,
+      "learning_rate": 0.00014137000547139223,
+      "loss": 0.7294,
+      "step": 1198
+    },
+    {
+      "epoch": 0.38368,
+      "grad_norm": 0.35356490707573734,
+      "learning_rate": 0.00014127562010092865,
+      "loss": 0.6645,
+      "step": 1199
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.35479395393239455,
+      "learning_rate": 0.00014118119038783588,
+      "loss": 0.6414,
+      "step": 1200
+    },
+    {
+      "epoch": 0.38432,
+      "grad_norm": 0.3724724382878393,
+      "learning_rate": 0.00014108671643356025,
+      "loss": 0.6267,
+      "step": 1201
+    },
+    {
+      "epoch": 0.38464,
+      "grad_norm": 0.3498487938229309,
+      "learning_rate": 0.00014099219833959564,
+      "loss": 0.5792,
+      "step": 1202
+    },
+    {
+      "epoch": 0.38496,
+      "grad_norm": 0.4074467207222055,
+      "learning_rate": 0.00014089763620748339,
+      "loss": 0.6823,
+      "step": 1203
+    },
+    {
+      "epoch": 0.38528,
+      "grad_norm": 0.3571418637491229,
+      "learning_rate": 0.00014080303013881207,
+      "loss": 0.6407,
+      "step": 1204
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.36884655527109417,
+      "learning_rate": 0.00014070838023521763,
+      "loss": 0.6547,
+      "step": 1205
+    },
+    {
+      "epoch": 0.38592,
+      "grad_norm": 0.3825316556449097,
+      "learning_rate": 0.00014061368659838293,
+      "loss": 0.6766,
+      "step": 1206
+    },
+    {
+      "epoch": 0.38624,
+      "grad_norm": 0.37688498463873227,
+      "learning_rate": 0.00014051894933003782,
+      "loss": 0.6741,
+      "step": 1207
+    },
+    {
+      "epoch": 0.38656,
+      "grad_norm": 0.3457593953938326,
+      "learning_rate": 0.00014042416853195914,
+      "loss": 0.645,
+      "step": 1208
+    },
+    {
+      "epoch": 0.38688,
+      "grad_norm": 0.3545257775885476,
+      "learning_rate": 0.0001403293443059704,
+      "loss": 0.6533,
+      "step": 1209
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.37147392400843676,
+      "learning_rate": 0.0001402344767539418,
+      "loss": 0.6434,
+      "step": 1210
+    },
+    {
+      "epoch": 0.38752,
+      "grad_norm": 0.34665788800852526,
+      "learning_rate": 0.0001401395659777901,
+      "loss": 0.6197,
+      "step": 1211
+    },
+    {
+      "epoch": 0.38784,
+      "grad_norm": 0.36470091984128866,
+      "learning_rate": 0.00014004461207947847,
+      "loss": 0.6977,
+      "step": 1212
+    },
+    {
+      "epoch": 0.38816,
+      "grad_norm": 0.40017899300612314,
+      "learning_rate": 0.00013994961516101642,
+      "loss": 0.6801,
+      "step": 1213
+    },
+    {
+      "epoch": 0.38848,
+      "grad_norm": 0.3508062826536708,
+      "learning_rate": 0.00013985457532445962,
+      "loss": 0.6162,
+      "step": 1214
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.3565330428544234,
+      "learning_rate": 0.00013975949267190996,
+      "loss": 0.6848,
+      "step": 1215
+    },
+    {
+      "epoch": 0.38912,
+      "grad_norm": 0.36592503414841004,
+      "learning_rate": 0.00013966436730551525,
+      "loss": 0.6225,
+      "step": 1216
+    },
+    {
+      "epoch": 0.38944,
+      "grad_norm": 0.3631019284931274,
+      "learning_rate": 0.00013956919932746914,
+      "loss": 0.6931,
+      "step": 1217
+    },
+    {
+      "epoch": 0.38976,
+      "grad_norm": 0.499376685599847,
+      "learning_rate": 0.00013947398884001121,
+      "loss": 0.6957,
+      "step": 1218
+    },
+    {
+      "epoch": 0.39008,
+      "grad_norm": 0.3669103263170906,
+      "learning_rate": 0.0001393787359454266,
+      "loss": 0.6645,
+      "step": 1219
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.36108966074941806,
+      "learning_rate": 0.00013928344074604597,
+      "loss": 0.6159,
+      "step": 1220
+    },
+    {
+      "epoch": 0.39072,
+      "grad_norm": 0.3310372149611766,
+      "learning_rate": 0.00013918810334424554,
+      "loss": 0.605,
+      "step": 1221
+    },
+    {
+      "epoch": 0.39104,
+      "grad_norm": 0.36391317255929906,
+      "learning_rate": 0.00013909272384244679,
+      "loss": 0.6329,
+      "step": 1222
+    },
+    {
+      "epoch": 0.39136,
+      "grad_norm": 0.36071028090097496,
+      "learning_rate": 0.00013899730234311644,
+      "loss": 0.6457,
+      "step": 1223
+    },
+    {
+      "epoch": 0.39168,
+      "grad_norm": 0.3606414251260815,
+      "learning_rate": 0.00013890183894876642,
+      "loss": 0.6474,
+      "step": 1224
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.3855067095452181,
+      "learning_rate": 0.00013880633376195348,
+      "loss": 0.5796,
+      "step": 1225
+    },
+    {
+      "epoch": 0.39232,
+      "grad_norm": 0.3698567294505442,
+      "learning_rate": 0.00013871078688527943,
+      "loss": 0.6322,
+      "step": 1226
+    },
+    {
+      "epoch": 0.39264,
+      "grad_norm": 0.3999914294986726,
+      "learning_rate": 0.00013861519842139078,
+      "loss": 0.6893,
+      "step": 1227
+    },
+    {
+      "epoch": 0.39296,
+      "grad_norm": 0.3774958160029902,
+      "learning_rate": 0.00013851956847297882,
+      "loss": 0.6137,
+      "step": 1228
+    },
+    {
+      "epoch": 0.39328,
+      "grad_norm": 0.3527654104745092,
+      "learning_rate": 0.00013842389714277927,
+      "loss": 0.6006,
+      "step": 1229
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3610566136205061,
+      "learning_rate": 0.0001383281845335724,
+      "loss": 0.6642,
+      "step": 1230
+    },
+    {
+      "epoch": 0.39392,
+      "grad_norm": 0.3663583619345716,
+      "learning_rate": 0.00013823243074818277,
+      "loss": 0.647,
+      "step": 1231
+    },
+    {
+      "epoch": 0.39424,
+      "grad_norm": 0.3504005929992112,
+      "learning_rate": 0.00013813663588947925,
+      "loss": 0.6456,
+      "step": 1232
+    },
+    {
+      "epoch": 0.39456,
+      "grad_norm": 0.3508202934329224,
+      "learning_rate": 0.00013804080006037478,
+      "loss": 0.6111,
+      "step": 1233
+    },
+    {
+      "epoch": 0.39488,
+      "grad_norm": 0.3447725408829163,
+      "learning_rate": 0.00013794492336382635,
+      "loss": 0.6204,
+      "step": 1234
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.36481364877718714,
+      "learning_rate": 0.00013784900590283473,
+      "loss": 0.6495,
+      "step": 1235
+    },
+    {
+      "epoch": 0.39552,
+      "grad_norm": 0.36043626294563597,
+      "learning_rate": 0.0001377530477804447,
+      "loss": 0.6193,
+      "step": 1236
+    },
+    {
+      "epoch": 0.39584,
+      "grad_norm": 0.47577918253085855,
+      "learning_rate": 0.0001376570490997446,
+      "loss": 0.6589,
+      "step": 1237
+    },
+    {
+      "epoch": 0.39616,
+      "grad_norm": 0.36035416220427785,
+      "learning_rate": 0.00013756100996386626,
+      "loss": 0.6718,
+      "step": 1238
+    },
+    {
+      "epoch": 0.39648,
+      "grad_norm": 0.3787175747447074,
+      "learning_rate": 0.00013746493047598512,
+      "loss": 0.616,
+      "step": 1239
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.35635790868036293,
+      "learning_rate": 0.00013736881073931993,
+      "loss": 0.6541,
+      "step": 1240
+    },
+    {
+      "epoch": 0.39712,
+      "grad_norm": 0.38499276580247926,
+      "learning_rate": 0.00013727265085713264,
+      "loss": 0.6556,
+      "step": 1241
+    },
+    {
+      "epoch": 0.39744,
+      "grad_norm": 0.3492776702835988,
+      "learning_rate": 0.00013717645093272833,
+      "loss": 0.6559,
+      "step": 1242
+    },
+    {
+      "epoch": 0.39776,
+      "grad_norm": 0.3321756495386628,
+      "learning_rate": 0.00013708021106945514,
+      "loss": 0.6064,
+      "step": 1243
+    },
+    {
+      "epoch": 0.39808,
+      "grad_norm": 0.3758795103904307,
+      "learning_rate": 0.00013698393137070403,
+      "loss": 0.6852,
+      "step": 1244
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.40288062578089195,
+      "learning_rate": 0.00013688761193990888,
+      "loss": 0.6303,
+      "step": 1245
+    },
+    {
+      "epoch": 0.39872,
+      "grad_norm": 0.3494164272649773,
+      "learning_rate": 0.00013679125288054621,
+      "loss": 0.6037,
+      "step": 1246
+    },
+    {
+      "epoch": 0.39904,
+      "grad_norm": 0.35293284207782044,
+      "learning_rate": 0.00013669485429613506,
+      "loss": 0.6114,
+      "step": 1247
+    },
+    {
+      "epoch": 0.39936,
+      "grad_norm": 0.3566926311089219,
+      "learning_rate": 0.00013659841629023696,
+      "loss": 0.6565,
+      "step": 1248
+    },
+    {
+      "epoch": 0.39968,
+      "grad_norm": 0.3573981006039202,
+      "learning_rate": 0.00013650193896645583,
+      "loss": 0.643,
+      "step": 1249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3462914767129748,
+      "learning_rate": 0.00013640542242843778,
+      "loss": 0.5933,
+      "step": 1250
+    },
+    {
+      "epoch": 0.40032,
+      "grad_norm": 0.38179837226718827,
+      "learning_rate": 0.00013630886677987107,
+      "loss": 0.6943,
+      "step": 1251
+    },
+    {
+      "epoch": 0.40064,
+      "grad_norm": 0.36061348926406517,
+      "learning_rate": 0.00013621227212448598,
+      "loss": 0.636,
+      "step": 1252
+    },
+    {
+      "epoch": 0.40096,
+      "grad_norm": 0.3816641614097142,
+      "learning_rate": 0.00013611563856605463,
+      "loss": 0.6302,
+      "step": 1253
+    },
+    {
+      "epoch": 0.40128,
+      "grad_norm": 0.37524463793177804,
+      "learning_rate": 0.00013601896620839108,
+      "loss": 0.6351,
+      "step": 1254
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3819320271157617,
+      "learning_rate": 0.00013592225515535094,
+      "loss": 0.6754,
+      "step": 1255
+    },
+    {
+      "epoch": 0.40192,
+      "grad_norm": 0.3635548982949283,
+      "learning_rate": 0.00013582550551083142,
+      "loss": 0.6361,
+      "step": 1256
+    },
+    {
+      "epoch": 0.40224,
+      "grad_norm": 0.3580769880672606,
+      "learning_rate": 0.0001357287173787712,
+      "loss": 0.6461,
+      "step": 1257
+    },
+    {
+      "epoch": 0.40256,
+      "grad_norm": 0.372916927434414,
+      "learning_rate": 0.0001356318908631504,
+      "loss": 0.6322,
+      "step": 1258
+    },
+    {
+      "epoch": 0.40288,
+      "grad_norm": 0.3976583442627668,
+      "learning_rate": 0.00013553502606799018,
+      "loss": 0.6387,
+      "step": 1259
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.36388274254846326,
+      "learning_rate": 0.00013543812309735296,
+      "loss": 0.6673,
+      "step": 1260
+    },
+    {
+      "epoch": 0.40352,
+      "grad_norm": 0.3617105015278976,
+      "learning_rate": 0.00013534118205534216,
+      "loss": 0.6036,
+      "step": 1261
+    },
+    {
+      "epoch": 0.40384,
+      "grad_norm": 0.3564492535214902,
+      "learning_rate": 0.0001352442030461021,
+      "loss": 0.6728,
+      "step": 1262
+    },
+    {
+      "epoch": 0.40416,
+      "grad_norm": 0.3875792443666019,
+      "learning_rate": 0.00013514718617381778,
+      "loss": 0.6609,
+      "step": 1263
+    },
+    {
+      "epoch": 0.40448,
+      "grad_norm": 0.3850308563040858,
+      "learning_rate": 0.0001350501315427151,
+      "loss": 0.6836,
+      "step": 1264
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.33943646109194714,
+      "learning_rate": 0.0001349530392570603,
+      "loss": 0.6344,
+      "step": 1265
+    },
+    {
+      "epoch": 0.40512,
+      "grad_norm": 0.3465323506995736,
+      "learning_rate": 0.00013485590942116017,
+      "loss": 0.556,
+      "step": 1266
+    },
+    {
+      "epoch": 0.40544,
+      "grad_norm": 0.3605078381500674,
+      "learning_rate": 0.00013475874213936189,
+      "loss": 0.5765,
+      "step": 1267
+    },
+    {
+      "epoch": 0.40576,
+      "grad_norm": 0.3554480016174343,
+      "learning_rate": 0.00013466153751605275,
+      "loss": 0.6432,
+      "step": 1268
+    },
+    {
+      "epoch": 0.40608,
+      "grad_norm": 0.33018202183452494,
+      "learning_rate": 0.00013456429565566027,
+      "loss": 0.6288,
+      "step": 1269
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3690413857324501,
+      "learning_rate": 0.0001344670166626519,
+      "loss": 0.6723,
+      "step": 1270
+    },
+    {
+      "epoch": 0.40672,
+      "grad_norm": 0.4280733453069583,
+      "learning_rate": 0.000134369700641535,
+      "loss": 0.6293,
+      "step": 1271
+    },
+    {
+      "epoch": 0.40704,
+      "grad_norm": 0.34315319646574005,
+      "learning_rate": 0.00013427234769685674,
+      "loss": 0.6653,
+      "step": 1272
+    },
+    {
+      "epoch": 0.40736,
+      "grad_norm": 0.453466689810093,
+      "learning_rate": 0.0001341749579332039,
+      "loss": 0.659,
+      "step": 1273
+    },
+    {
+      "epoch": 0.40768,
+      "grad_norm": 0.35292056287522733,
+      "learning_rate": 0.00013407753145520287,
+      "loss": 0.6499,
+      "step": 1274
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.38587237468717983,
+      "learning_rate": 0.00013398006836751945,
+      "loss": 0.6537,
+      "step": 1275
+    },
+    {
+      "epoch": 0.40832,
+      "grad_norm": 0.3529306070695278,
+      "learning_rate": 0.0001338825687748588,
+      "loss": 0.6817,
+      "step": 1276
+    },
+    {
+      "epoch": 0.40864,
+      "grad_norm": 0.36051844834341557,
+      "learning_rate": 0.00013378503278196522,
+      "loss": 0.6337,
+      "step": 1277
+    },
+    {
+      "epoch": 0.40896,
+      "grad_norm": 0.343321159940691,
+      "learning_rate": 0.00013368746049362225,
+      "loss": 0.6529,
+      "step": 1278
+    },
+    {
+      "epoch": 0.40928,
+      "grad_norm": 0.3351048408709637,
+      "learning_rate": 0.00013358985201465226,
+      "loss": 0.6232,
+      "step": 1279
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3435219396633898,
+      "learning_rate": 0.00013349220744991665,
+      "loss": 0.6401,
+      "step": 1280
+    },
+    {
+      "epoch": 0.40992,
+      "grad_norm": 0.4067975516341677,
+      "learning_rate": 0.0001333945269043155,
+      "loss": 0.7252,
+      "step": 1281
+    },
+    {
+      "epoch": 0.41024,
+      "grad_norm": 0.35214924212694954,
+      "learning_rate": 0.0001332968104827876,
+      "loss": 0.6165,
+      "step": 1282
+    },
+    {
+      "epoch": 0.41056,
+      "grad_norm": 0.34850375228792707,
+      "learning_rate": 0.00013319905829031016,
+      "loss": 0.6418,
+      "step": 1283
+    },
+    {
+      "epoch": 0.41088,
+      "grad_norm": 0.34318201572025187,
+      "learning_rate": 0.000133101270431899,
+      "loss": 0.7001,
+      "step": 1284
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3733743863546783,
+      "learning_rate": 0.0001330034470126081,
+      "loss": 0.6991,
+      "step": 1285
+    },
+    {
+      "epoch": 0.41152,
+      "grad_norm": 0.3770948893491864,
+      "learning_rate": 0.00013290558813752976,
+      "loss": 0.6659,
+      "step": 1286
+    },
+    {
+      "epoch": 0.41184,
+      "grad_norm": 0.35994327221867084,
+      "learning_rate": 0.00013280769391179427,
+      "loss": 0.6326,
+      "step": 1287
+    },
+    {
+      "epoch": 0.41216,
+      "grad_norm": 0.34861119605672714,
+      "learning_rate": 0.00013270976444056993,
+      "loss": 0.618,
+      "step": 1288
+    },
+    {
+      "epoch": 0.41248,
+      "grad_norm": 0.3568665716847094,
+      "learning_rate": 0.00013261179982906296,
+      "loss": 0.6713,
+      "step": 1289
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.36238510851647426,
+      "learning_rate": 0.0001325138001825173,
+      "loss": 0.6598,
+      "step": 1290
+    },
+    {
+      "epoch": 0.41312,
+      "grad_norm": 0.3366664810713671,
+      "learning_rate": 0.00013241576560621445,
+      "loss": 0.621,
+      "step": 1291
+    },
+    {
+      "epoch": 0.41344,
+      "grad_norm": 0.3669410714488144,
+      "learning_rate": 0.00013231769620547358,
+      "loss": 0.6661,
+      "step": 1292
+    },
+    {
+      "epoch": 0.41376,
+      "grad_norm": 0.34475722878964293,
+      "learning_rate": 0.00013221959208565114,
+      "loss": 0.6483,
+      "step": 1293
+    },
+    {
+      "epoch": 0.41408,
+      "grad_norm": 0.3460746296692432,
+      "learning_rate": 0.00013212145335214097,
+      "loss": 0.6528,
+      "step": 1294
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.3730770689420592,
+      "learning_rate": 0.00013202328011037404,
+      "loss": 0.6444,
+      "step": 1295
+    },
+    {
+      "epoch": 0.41472,
+      "grad_norm": 0.3568421824960019,
+      "learning_rate": 0.0001319250724658184,
+      "loss": 0.6464,
+      "step": 1296
+    },
+    {
+      "epoch": 0.41504,
+      "grad_norm": 0.3575319229223752,
+      "learning_rate": 0.0001318268305239791,
+      "loss": 0.6099,
+      "step": 1297
+    },
+    {
+      "epoch": 0.41536,
+      "grad_norm": 0.37343788399550126,
+      "learning_rate": 0.00013172855439039802,
+      "loss": 0.6049,
+      "step": 1298
+    },
+    {
+      "epoch": 0.41568,
+      "grad_norm": 0.3764856116892189,
+      "learning_rate": 0.0001316302441706537,
+      "loss": 0.6894,
+      "step": 1299
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3573642349959537,
+      "learning_rate": 0.00013153189997036142,
+      "loss": 0.661,
+      "step": 1300
+    },
+    {
+      "epoch": 0.41632,
+      "grad_norm": 0.36075971225401227,
+      "learning_rate": 0.00013143352189517283,
+      "loss": 0.6371,
+      "step": 1301
+    },
+    {
+      "epoch": 0.41664,
+      "grad_norm": 0.35425791072986146,
+      "learning_rate": 0.0001313351100507761,
+      "loss": 0.6725,
+      "step": 1302
+    },
+    {
+      "epoch": 0.41696,
+      "grad_norm": 0.34591639250099865,
+      "learning_rate": 0.00013123666454289566,
+      "loss": 0.7016,
+      "step": 1303
+    },
+    {
+      "epoch": 0.41728,
+      "grad_norm": 0.33614920683151617,
+      "learning_rate": 0.00013113818547729202,
+      "loss": 0.6235,
+      "step": 1304
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.3673677373910372,
+      "learning_rate": 0.00013103967295976179,
+      "loss": 0.6848,
+      "step": 1305
+    },
+    {
+      "epoch": 0.41792,
+      "grad_norm": 0.34566980754791926,
+      "learning_rate": 0.00013094112709613747,
+      "loss": 0.671,
+      "step": 1306
+    },
+    {
+      "epoch": 0.41824,
+      "grad_norm": 0.3303812462162933,
+      "learning_rate": 0.00013084254799228753,
+      "loss": 0.5806,
+      "step": 1307
+    },
+    {
+      "epoch": 0.41856,
+      "grad_norm": 0.34825215783206426,
+      "learning_rate": 0.000130743935754116,
+      "loss": 0.5705,
+      "step": 1308
+    },
+    {
+      "epoch": 0.41888,
+      "grad_norm": 0.35838984534257606,
+      "learning_rate": 0.00013064529048756256,
+      "loss": 0.6494,
+      "step": 1309
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.34119239755194375,
+      "learning_rate": 0.00013054661229860238,
+      "loss": 0.63,
+      "step": 1310
+    },
+    {
+      "epoch": 0.41952,
+      "grad_norm": 0.36455797899709114,
+      "learning_rate": 0.000130447901293246,
+      "loss": 0.6876,
+      "step": 1311
+    },
+    {
+      "epoch": 0.41984,
+      "grad_norm": 0.34262004353926406,
+      "learning_rate": 0.00013034915757753916,
+      "loss": 0.6085,
+      "step": 1312
+    },
+    {
+      "epoch": 0.42016,
+      "grad_norm": 0.3503190021222328,
+      "learning_rate": 0.00013025038125756284,
+      "loss": 0.606,
+      "step": 1313
+    },
+    {
+      "epoch": 0.42048,
+      "grad_norm": 0.3816539330707386,
+      "learning_rate": 0.0001301515724394329,
+      "loss": 0.6845,
+      "step": 1314
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3594555903017935,
+      "learning_rate": 0.00013005273122930036,
+      "loss": 0.6854,
+      "step": 1315
+    },
+    {
+      "epoch": 0.42112,
+      "grad_norm": 0.34466991551002474,
+      "learning_rate": 0.00012995385773335074,
+      "loss": 0.6392,
+      "step": 1316
+    },
+    {
+      "epoch": 0.42144,
+      "grad_norm": 0.32219332502638137,
+      "learning_rate": 0.00012985495205780447,
+      "loss": 0.6103,
+      "step": 1317
+    },
+    {
+      "epoch": 0.42176,
+      "grad_norm": 0.3416425229543437,
+      "learning_rate": 0.00012975601430891643,
+      "loss": 0.6383,
+      "step": 1318
+    },
+    {
+      "epoch": 0.42208,
+      "grad_norm": 0.36489715660687144,
+      "learning_rate": 0.00012965704459297602,
+      "loss": 0.6809,
+      "step": 1319
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.346798826084252,
+      "learning_rate": 0.00012955804301630693,
+      "loss": 0.6624,
+      "step": 1320
+    },
+    {
+      "epoch": 0.42272,
+      "grad_norm": 0.36112835877244714,
+      "learning_rate": 0.00012945900968526716,
+      "loss": 0.6684,
+      "step": 1321
+    },
+    {
+      "epoch": 0.42304,
+      "grad_norm": 0.3652280722113376,
+      "learning_rate": 0.00012935994470624875,
+      "loss": 0.6076,
+      "step": 1322
+    },
+    {
+      "epoch": 0.42336,
+      "grad_norm": 0.3574492527504861,
+      "learning_rate": 0.0001292608481856777,
+      "loss": 0.6127,
+      "step": 1323
+    },
+    {
+      "epoch": 0.42368,
+      "grad_norm": 0.3504809839004475,
+      "learning_rate": 0.00012916172023001406,
+      "loss": 0.6577,
+      "step": 1324
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3844463042997439,
+      "learning_rate": 0.00012906256094575146,
+      "loss": 0.6172,
+      "step": 1325
+    },
+    {
+      "epoch": 0.42432,
+      "grad_norm": 0.3797990055410563,
+      "learning_rate": 0.0001289633704394173,
+      "loss": 0.6419,
+      "step": 1326
+    },
+    {
+      "epoch": 0.42464,
+      "grad_norm": 0.3868650002573214,
+      "learning_rate": 0.00012886414881757246,
+      "loss": 0.6701,
+      "step": 1327
+    },
+    {
+      "epoch": 0.42496,
+      "grad_norm": 0.3528851337849618,
+      "learning_rate": 0.00012876489618681135,
+      "loss": 0.6075,
+      "step": 1328
+    },
+    {
+      "epoch": 0.42528,
+      "grad_norm": 0.34703654994128025,
+      "learning_rate": 0.0001286656126537616,
+      "loss": 0.6267,
+      "step": 1329
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.34437613160416275,
+      "learning_rate": 0.00012856629832508408,
+      "loss": 0.6553,
+      "step": 1330
+    },
+    {
+      "epoch": 0.42592,
+      "grad_norm": 0.33037427360110766,
+      "learning_rate": 0.00012846695330747266,
+      "loss": 0.6217,
+      "step": 1331
+    },
+    {
+      "epoch": 0.42624,
+      "grad_norm": 0.3762704604148889,
+      "learning_rate": 0.00012836757770765434,
+      "loss": 0.6614,
+      "step": 1332
+    },
+    {
+      "epoch": 0.42656,
+      "grad_norm": 0.33931856387279397,
+      "learning_rate": 0.0001282681716323888,
+      "loss": 0.6262,
+      "step": 1333
+    },
+    {
+      "epoch": 0.42688,
+      "grad_norm": 0.3827684139348457,
+      "learning_rate": 0.00012816873518846863,
+      "loss": 0.6486,
+      "step": 1334
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.35601274758876245,
+      "learning_rate": 0.00012806926848271886,
+      "loss": 0.6637,
+      "step": 1335
+    },
+    {
+      "epoch": 0.42752,
+      "grad_norm": 0.3510290212188483,
+      "learning_rate": 0.00012796977162199717,
+      "loss": 0.5754,
+      "step": 1336
+    },
+    {
+      "epoch": 0.42784,
+      "grad_norm": 0.35443467930466266,
+      "learning_rate": 0.00012787024471319362,
+      "loss": 0.6089,
+      "step": 1337
+    },
+    {
+      "epoch": 0.42816,
+      "grad_norm": 0.3475015658292733,
+      "learning_rate": 0.0001277706878632305,
+      "loss": 0.6315,
+      "step": 1338
+    },
+    {
+      "epoch": 0.42848,
+      "grad_norm": 0.37814373328016304,
+      "learning_rate": 0.0001276711011790623,
+      "loss": 0.6634,
+      "step": 1339
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3701372610710627,
+      "learning_rate": 0.00012757148476767553,
+      "loss": 0.6833,
+      "step": 1340
+    },
+    {
+      "epoch": 0.42912,
+      "grad_norm": 0.3387632680188932,
+      "learning_rate": 0.00012747183873608865,
+      "loss": 0.6667,
+      "step": 1341
+    },
+    {
+      "epoch": 0.42944,
+      "grad_norm": 0.35593204722664024,
+      "learning_rate": 0.00012737216319135198,
+      "loss": 0.6439,
+      "step": 1342
+    },
+    {
+      "epoch": 0.42976,
+      "grad_norm": 0.3772028607822921,
+      "learning_rate": 0.00012727245824054753,
+      "loss": 0.6426,
+      "step": 1343
+    },
+    {
+      "epoch": 0.43008,
+      "grad_norm": 0.3845568482838058,
+      "learning_rate": 0.00012717272399078884,
+      "loss": 0.6448,
+      "step": 1344
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3447425244248575,
+      "learning_rate": 0.00012707296054922097,
+      "loss": 0.6513,
+      "step": 1345
+    },
+    {
+      "epoch": 0.43072,
+      "grad_norm": 0.33810265020031316,
+      "learning_rate": 0.00012697316802302036,
+      "loss": 0.6762,
+      "step": 1346
+    },
+    {
+      "epoch": 0.43104,
+      "grad_norm": 0.3475298850931401,
+      "learning_rate": 0.0001268733465193947,
+      "loss": 0.6061,
+      "step": 1347
+    },
+    {
+      "epoch": 0.43136,
+      "grad_norm": 0.3608790325277824,
+      "learning_rate": 0.0001267734961455828,
+      "loss": 0.6478,
+      "step": 1348
+    },
+    {
+      "epoch": 0.43168,
+      "grad_norm": 0.36508800193478946,
+      "learning_rate": 0.0001266736170088544,
+      "loss": 0.6109,
+      "step": 1349
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3383799099306968,
+      "learning_rate": 0.00012657370921651025,
+      "loss": 0.6334,
+      "step": 1350
+    },
+    {
+      "epoch": 0.43232,
+      "grad_norm": 0.35618257726364355,
+      "learning_rate": 0.00012647377287588186,
+      "loss": 0.6594,
+      "step": 1351
+    },
+    {
+      "epoch": 0.43264,
+      "grad_norm": 0.35177084347016874,
+      "learning_rate": 0.00012637380809433143,
+      "loss": 0.6484,
+      "step": 1352
+    },
+    {
+      "epoch": 0.43296,
+      "grad_norm": 0.3548705046761641,
+      "learning_rate": 0.00012627381497925163,
+      "loss": 0.6384,
+      "step": 1353
+    },
+    {
+      "epoch": 0.43328,
+      "grad_norm": 0.40688751781160165,
+      "learning_rate": 0.00012617379363806563,
+      "loss": 0.6618,
+      "step": 1354
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3452045247527599,
+      "learning_rate": 0.00012607374417822695,
+      "loss": 0.6391,
+      "step": 1355
+    },
+    {
+      "epoch": 0.43392,
+      "grad_norm": 0.35552910751039707,
+      "learning_rate": 0.00012597366670721925,
+      "loss": 0.6074,
+      "step": 1356
+    },
+    {
+      "epoch": 0.43424,
+      "grad_norm": 0.3764260118454921,
+      "learning_rate": 0.00012587356133255632,
+      "loss": 0.6755,
+      "step": 1357
+    },
+    {
+      "epoch": 0.43456,
+      "grad_norm": 0.3533763806443953,
+      "learning_rate": 0.00012577342816178194,
+      "loss": 0.6562,
+      "step": 1358
+    },
+    {
+      "epoch": 0.43488,
+      "grad_norm": 0.3544995820110034,
+      "learning_rate": 0.0001256732673024697,
+      "loss": 0.6499,
+      "step": 1359
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.37073666265091326,
+      "learning_rate": 0.00012557307886222304,
+      "loss": 0.6247,
+      "step": 1360
+    },
+    {
+      "epoch": 0.43552,
+      "grad_norm": 0.37107082767575766,
+      "learning_rate": 0.00012547286294867487,
+      "loss": 0.5801,
+      "step": 1361
+    },
+    {
+      "epoch": 0.43584,
+      "grad_norm": 0.35227638320394655,
+      "learning_rate": 0.00012537261966948777,
+      "loss": 0.6513,
+      "step": 1362
+    },
+    {
+      "epoch": 0.43616,
+      "grad_norm": 0.3601658309861874,
+      "learning_rate": 0.00012527234913235362,
+      "loss": 0.6262,
+      "step": 1363
+    },
+    {
+      "epoch": 0.43648,
+      "grad_norm": 0.35008795436825585,
+      "learning_rate": 0.00012517205144499366,
+      "loss": 0.6378,
+      "step": 1364
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.346105470523686,
+      "learning_rate": 0.00012507172671515822,
+      "loss": 0.6192,
+      "step": 1365
+    },
+    {
+      "epoch": 0.43712,
+      "grad_norm": 0.3773640996777558,
+      "learning_rate": 0.00012497137505062674,
+      "loss": 0.6452,
+      "step": 1366
+    },
+    {
+      "epoch": 0.43744,
+      "grad_norm": 0.3468633481588172,
+      "learning_rate": 0.00012487099655920757,
+      "loss": 0.6582,
+      "step": 1367
+    },
+    {
+      "epoch": 0.43776,
+      "grad_norm": 0.38851105590023044,
+      "learning_rate": 0.00012477059134873784,
+      "loss": 0.6304,
+      "step": 1368
+    },
+    {
+      "epoch": 0.43808,
+      "grad_norm": 0.34867703247550724,
+      "learning_rate": 0.00012467015952708348,
+      "loss": 0.6342,
+      "step": 1369
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3563014878857632,
+      "learning_rate": 0.00012456970120213896,
+      "loss": 0.6053,
+      "step": 1370
+    },
+    {
+      "epoch": 0.43872,
+      "grad_norm": 0.3449361759382152,
+      "learning_rate": 0.00012446921648182716,
+      "loss": 0.6923,
+      "step": 1371
+    },
+    {
+      "epoch": 0.43904,
+      "grad_norm": 0.3641615446986733,
+      "learning_rate": 0.00012436870547409944,
+      "loss": 0.6648,
+      "step": 1372
+    },
+    {
+      "epoch": 0.43936,
+      "grad_norm": 0.3395863365971903,
+      "learning_rate": 0.0001242681682869353,
+      "loss": 0.5922,
+      "step": 1373
+    },
+    {
+      "epoch": 0.43968,
+      "grad_norm": 0.3539280047765845,
+      "learning_rate": 0.00012416760502834243,
+      "loss": 0.6219,
+      "step": 1374
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.34918862471192,
+      "learning_rate": 0.0001240670158063565,
+      "loss": 0.7103,
+      "step": 1375
+    },
+    {
+      "epoch": 0.44032,
+      "grad_norm": 0.3595735599927838,
+      "learning_rate": 0.00012396640072904103,
+      "loss": 0.5902,
+      "step": 1376
+    },
+    {
+      "epoch": 0.44064,
+      "grad_norm": 0.35856148872984034,
+      "learning_rate": 0.00012386575990448742,
+      "loss": 0.6201,
+      "step": 1377
+    },
+    {
+      "epoch": 0.44096,
+      "grad_norm": 0.37700670637280187,
+      "learning_rate": 0.0001237650934408146,
+      "loss": 0.6418,
+      "step": 1378
+    },
+    {
+      "epoch": 0.44128,
+      "grad_norm": 0.36989058657011725,
+      "learning_rate": 0.00012366440144616917,
+      "loss": 0.6457,
+      "step": 1379
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4419841347795627,
+      "learning_rate": 0.0001235636840287251,
+      "loss": 0.6498,
+      "step": 1380
+    },
+    {
+      "epoch": 0.44192,
+      "grad_norm": 0.34323897155294086,
+      "learning_rate": 0.00012346294129668366,
+      "loss": 0.6242,
+      "step": 1381
+    },
+    {
+      "epoch": 0.44224,
+      "grad_norm": 0.34643679938270927,
+      "learning_rate": 0.0001233621733582733,
+      "loss": 0.6126,
+      "step": 1382
+    },
+    {
+      "epoch": 0.44256,
+      "grad_norm": 0.3356235258047478,
+      "learning_rate": 0.00012326138032174965,
+      "loss": 0.6364,
+      "step": 1383
+    },
+    {
+      "epoch": 0.44288,
+      "grad_norm": 0.3575902692954344,
+      "learning_rate": 0.00012316056229539518,
+      "loss": 0.6369,
+      "step": 1384
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3393711534808123,
+      "learning_rate": 0.00012305971938751924,
+      "loss": 0.6429,
+      "step": 1385
+    },
+    {
+      "epoch": 0.44352,
+      "grad_norm": 0.3684626498108965,
+      "learning_rate": 0.00012295885170645796,
+      "loss": 0.5705,
+      "step": 1386
+    },
+    {
+      "epoch": 0.44384,
+      "grad_norm": 0.3423815982347849,
+      "learning_rate": 0.00012285795936057406,
+      "loss": 0.6181,
+      "step": 1387
+    },
+    {
+      "epoch": 0.44416,
+      "grad_norm": 0.3656618249888552,
+      "learning_rate": 0.00012275704245825678,
+      "loss": 0.6802,
+      "step": 1388
+    },
+    {
+      "epoch": 0.44448,
+      "grad_norm": 0.3490014887116999,
+      "learning_rate": 0.0001226561011079216,
+      "loss": 0.6106,
+      "step": 1389
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3429221228928354,
+      "learning_rate": 0.00012255513541801049,
+      "loss": 0.6224,
+      "step": 1390
+    },
+    {
+      "epoch": 0.44512,
+      "grad_norm": 0.3491161411693488,
+      "learning_rate": 0.00012245414549699144,
+      "loss": 0.5897,
+      "step": 1391
+    },
+    {
+      "epoch": 0.44544,
+      "grad_norm": 0.3623171659441398,
+      "learning_rate": 0.00012235313145335844,
+      "loss": 0.6423,
+      "step": 1392
+    },
+    {
+      "epoch": 0.44576,
+      "grad_norm": 0.3420727006074005,
+      "learning_rate": 0.00012225209339563145,
+      "loss": 0.6036,
+      "step": 1393
+    },
+    {
+      "epoch": 0.44608,
+      "grad_norm": 0.3512269788663449,
+      "learning_rate": 0.00012215103143235623,
+      "loss": 0.6378,
+      "step": 1394
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3466087062962386,
+      "learning_rate": 0.00012204994567210426,
+      "loss": 0.583,
+      "step": 1395
+    },
+    {
+      "epoch": 0.44672,
+      "grad_norm": 0.38119716890420635,
+      "learning_rate": 0.00012194883622347246,
+      "loss": 0.6351,
+      "step": 1396
+    },
+    {
+      "epoch": 0.44704,
+      "grad_norm": 0.3625595031674755,
+      "learning_rate": 0.0001218477031950833,
+      "loss": 0.621,
+      "step": 1397
+    },
+    {
+      "epoch": 0.44736,
+      "grad_norm": 0.351603026050947,
+      "learning_rate": 0.00012174654669558454,
+      "loss": 0.6253,
+      "step": 1398
+    },
+    {
+      "epoch": 0.44768,
+      "grad_norm": 0.34827861710726954,
+      "learning_rate": 0.00012164536683364925,
+      "loss": 0.6171,
+      "step": 1399
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3602366774986206,
+      "learning_rate": 0.00012154416371797543,
+      "loss": 0.6534,
+      "step": 1400
+    },
+    {
+      "epoch": 0.44832,
+      "grad_norm": 0.35683792396738884,
+      "learning_rate": 0.0001214429374572862,
+      "loss": 0.6668,
+      "step": 1401
+    },
+    {
+      "epoch": 0.44864,
+      "grad_norm": 0.34068186682117463,
+      "learning_rate": 0.00012134168816032949,
+      "loss": 0.6373,
+      "step": 1402
+    },
+    {
+      "epoch": 0.44896,
+      "grad_norm": 0.343090577485209,
+      "learning_rate": 0.00012124041593587798,
+      "loss": 0.6159,
+      "step": 1403
+    },
+    {
+      "epoch": 0.44928,
+      "grad_norm": 0.34224599156117047,
+      "learning_rate": 0.00012113912089272898,
+      "loss": 0.6572,
+      "step": 1404
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.33193464365254555,
+      "learning_rate": 0.00012103780313970435,
+      "loss": 0.6303,
+      "step": 1405
+    },
+    {
+      "epoch": 0.44992,
+      "grad_norm": 0.369626764959218,
+      "learning_rate": 0.00012093646278565029,
+      "loss": 0.6286,
+      "step": 1406
+    },
+    {
+      "epoch": 0.45024,
+      "grad_norm": 0.35471716045787755,
+      "learning_rate": 0.00012083509993943732,
+      "loss": 0.646,
+      "step": 1407
+    },
+    {
+      "epoch": 0.45056,
+      "grad_norm": 0.3582100147411708,
+      "learning_rate": 0.00012073371470996009,
+      "loss": 0.6869,
+      "step": 1408
+    },
+    {
+      "epoch": 0.45088,
+      "grad_norm": 0.37186706964832433,
+      "learning_rate": 0.00012063230720613734,
+      "loss": 0.6737,
+      "step": 1409
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3191968885135222,
+      "learning_rate": 0.00012053087753691172,
+      "loss": 0.5869,
+      "step": 1410
+    },
+    {
+      "epoch": 0.45152,
+      "grad_norm": 0.3691993077316804,
+      "learning_rate": 0.00012042942581124967,
+      "loss": 0.6264,
+      "step": 1411
+    },
+    {
+      "epoch": 0.45184,
+      "grad_norm": 0.3695780460379041,
+      "learning_rate": 0.00012032795213814136,
+      "loss": 0.6712,
+      "step": 1412
+    },
+    {
+      "epoch": 0.45216,
+      "grad_norm": 0.3568156626276025,
+      "learning_rate": 0.00012022645662660054,
+      "loss": 0.6089,
+      "step": 1413
+    },
+    {
+      "epoch": 0.45248,
+      "grad_norm": 0.3655528196906202,
+      "learning_rate": 0.0001201249393856644,
+      "loss": 0.6389,
+      "step": 1414
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3386700688822135,
+      "learning_rate": 0.00012002340052439345,
+      "loss": 0.6517,
+      "step": 1415
+    },
+    {
+      "epoch": 0.45312,
+      "grad_norm": 0.3449331101654647,
+      "learning_rate": 0.00011992184015187145,
+      "loss": 0.6082,
+      "step": 1416
+    },
+    {
+      "epoch": 0.45344,
+      "grad_norm": 0.35254729205610263,
+      "learning_rate": 0.00011982025837720532,
+      "loss": 0.6636,
+      "step": 1417
+    },
+    {
+      "epoch": 0.45376,
+      "grad_norm": 0.35069952676512073,
+      "learning_rate": 0.00011971865530952491,
+      "loss": 0.5966,
+      "step": 1418
+    },
+    {
+      "epoch": 0.45408,
+      "grad_norm": 0.3515586909700109,
+      "learning_rate": 0.00011961703105798297,
+      "loss": 0.653,
+      "step": 1419
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.35351295595603843,
+      "learning_rate": 0.00011951538573175494,
+      "loss": 0.6445,
+      "step": 1420
+    },
+    {
+      "epoch": 0.45472,
+      "grad_norm": 0.37666401087814305,
+      "learning_rate": 0.00011941371944003905,
+      "loss": 0.6309,
+      "step": 1421
+    },
+    {
+      "epoch": 0.45504,
+      "grad_norm": 0.3469615941342529,
+      "learning_rate": 0.00011931203229205596,
+      "loss": 0.6541,
+      "step": 1422
+    },
+    {
+      "epoch": 0.45536,
+      "grad_norm": 0.35931076286043323,
+      "learning_rate": 0.00011921032439704867,
+      "loss": 0.646,
+      "step": 1423
+    },
+    {
+      "epoch": 0.45568,
+      "grad_norm": 0.3481688003934372,
+      "learning_rate": 0.00011910859586428258,
+      "loss": 0.649,
+      "step": 1424
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3953700777295994,
+      "learning_rate": 0.00011900684680304521,
+      "loss": 0.6735,
+      "step": 1425
+    },
+    {
+      "epoch": 0.45632,
+      "grad_norm": 0.35507200417141105,
+      "learning_rate": 0.00011890507732264616,
+      "loss": 0.5973,
+      "step": 1426
+    },
+    {
+      "epoch": 0.45664,
+      "grad_norm": 0.4579588714968065,
+      "learning_rate": 0.00011880328753241694,
+      "loss": 0.6464,
+      "step": 1427
+    },
+    {
+      "epoch": 0.45696,
+      "grad_norm": 0.3526760816498687,
+      "learning_rate": 0.00011870147754171093,
+      "loss": 0.6487,
+      "step": 1428
+    },
+    {
+      "epoch": 0.45728,
+      "grad_norm": 0.3613568385566994,
+      "learning_rate": 0.00011859964745990308,
+      "loss": 0.6025,
+      "step": 1429
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3604373311550373,
+      "learning_rate": 0.00011849779739639012,
+      "loss": 0.6658,
+      "step": 1430
+    },
+    {
+      "epoch": 0.45792,
+      "grad_norm": 0.36857102682555254,
+      "learning_rate": 0.00011839592746059008,
+      "loss": 0.6973,
+      "step": 1431
+    },
+    {
+      "epoch": 0.45824,
+      "grad_norm": 0.37162056867268184,
+      "learning_rate": 0.0001182940377619424,
+      "loss": 0.6365,
+      "step": 1432
+    },
+    {
+      "epoch": 0.45856,
+      "grad_norm": 0.33902705479978135,
+      "learning_rate": 0.00011819212840990778,
+      "loss": 0.6301,
+      "step": 1433
+    },
+    {
+      "epoch": 0.45888,
+      "grad_norm": 0.37413568959543353,
+      "learning_rate": 0.00011809019951396799,
+      "loss": 0.6239,
+      "step": 1434
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.35477794355157105,
+      "learning_rate": 0.00011798825118362582,
+      "loss": 0.6425,
+      "step": 1435
+    },
+    {
+      "epoch": 0.45952,
+      "grad_norm": 0.3522078087107167,
+      "learning_rate": 0.00011788628352840494,
+      "loss": 0.6418,
+      "step": 1436
+    },
+    {
+      "epoch": 0.45984,
+      "grad_norm": 0.35103797118150487,
+      "learning_rate": 0.00011778429665784978,
+      "loss": 0.6072,
+      "step": 1437
+    },
+    {
+      "epoch": 0.46016,
+      "grad_norm": 0.3353757349855207,
+      "learning_rate": 0.00011768229068152532,
+      "loss": 0.6002,
+      "step": 1438
+    },
+    {
+      "epoch": 0.46048,
+      "grad_norm": 0.38094559653695437,
+      "learning_rate": 0.00011758026570901726,
+      "loss": 0.6438,
+      "step": 1439
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3822728591127505,
+      "learning_rate": 0.00011747822184993153,
+      "loss": 0.6757,
+      "step": 1440
+    },
+    {
+      "epoch": 0.46112,
+      "grad_norm": 0.3507043929942193,
+      "learning_rate": 0.00011737615921389444,
+      "loss": 0.6217,
+      "step": 1441
+    },
+    {
+      "epoch": 0.46144,
+      "grad_norm": 0.3410123569919182,
+      "learning_rate": 0.00011727407791055244,
+      "loss": 0.6435,
+      "step": 1442
+    },
+    {
+      "epoch": 0.46176,
+      "grad_norm": 0.34433236456279837,
+      "learning_rate": 0.00011717197804957207,
+      "loss": 0.6401,
+      "step": 1443
+    },
+    {
+      "epoch": 0.46208,
+      "grad_norm": 0.3455912774447328,
+      "learning_rate": 0.00011706985974063978,
+      "loss": 0.5887,
+      "step": 1444
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3875625719562774,
+      "learning_rate": 0.00011696772309346182,
+      "loss": 0.6422,
+      "step": 1445
+    },
+    {
+      "epoch": 0.46272,
+      "grad_norm": 0.3654056864366787,
+      "learning_rate": 0.00011686556821776415,
+      "loss": 0.7212,
+      "step": 1446
+    },
+    {
+      "epoch": 0.46304,
+      "grad_norm": 0.35367123382665144,
+      "learning_rate": 0.00011676339522329232,
+      "loss": 0.6469,
+      "step": 1447
+    },
+    {
+      "epoch": 0.46336,
+      "grad_norm": 0.3338090449098278,
+      "learning_rate": 0.00011666120421981139,
+      "loss": 0.5892,
+      "step": 1448
+    },
+    {
+      "epoch": 0.46368,
+      "grad_norm": 0.3587477742861889,
+      "learning_rate": 0.0001165589953171057,
+      "loss": 0.6474,
+      "step": 1449
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3910356686835087,
+      "learning_rate": 0.00011645676862497885,
+      "loss": 0.662,
+      "step": 1450
+    },
+    {
+      "epoch": 0.46432,
+      "grad_norm": 0.3688443749662049,
+      "learning_rate": 0.00011635452425325355,
+      "loss": 0.6625,
+      "step": 1451
+    },
+    {
+      "epoch": 0.46464,
+      "grad_norm": 0.3621249395050262,
+      "learning_rate": 0.00011625226231177149,
+      "loss": 0.581,
+      "step": 1452
+    },
+    {
+      "epoch": 0.46496,
+      "grad_norm": 0.34116364664587356,
+      "learning_rate": 0.00011614998291039326,
+      "loss": 0.608,
+      "step": 1453
+    },
+    {
+      "epoch": 0.46528,
+      "grad_norm": 0.3321749101489626,
+      "learning_rate": 0.00011604768615899817,
+      "loss": 0.6658,
+      "step": 1454
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.34336578026686626,
+      "learning_rate": 0.0001159453721674842,
+      "loss": 0.6447,
+      "step": 1455
+    },
+    {
+      "epoch": 0.46592,
+      "grad_norm": 0.3390950292435595,
+      "learning_rate": 0.00011584304104576781,
+      "loss": 0.6386,
+      "step": 1456
+    },
+    {
+      "epoch": 0.46624,
+      "grad_norm": 0.33614897882321054,
+      "learning_rate": 0.00011574069290378398,
+      "loss": 0.5837,
+      "step": 1457
+    },
+    {
+      "epoch": 0.46656,
+      "grad_norm": 0.34076010531929685,
+      "learning_rate": 0.00011563832785148583,
+      "loss": 0.6049,
+      "step": 1458
+    },
+    {
+      "epoch": 0.46688,
+      "grad_norm": 0.3673040159836117,
+      "learning_rate": 0.00011553594599884471,
+      "loss": 0.6478,
+      "step": 1459
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.37603155481592304,
+      "learning_rate": 0.00011543354745585003,
+      "loss": 0.6341,
+      "step": 1460
+    },
+    {
+      "epoch": 0.46752,
+      "grad_norm": 0.32945147104457057,
+      "learning_rate": 0.00011533113233250911,
+      "loss": 0.5852,
+      "step": 1461
+    },
+    {
+      "epoch": 0.46784,
+      "grad_norm": 0.3516200094308783,
+      "learning_rate": 0.0001152287007388471,
+      "loss": 0.6567,
+      "step": 1462
+    },
+    {
+      "epoch": 0.46816,
+      "grad_norm": 0.36357213082683193,
+      "learning_rate": 0.00011512625278490683,
+      "loss": 0.6643,
+      "step": 1463
+    },
+    {
+      "epoch": 0.46848,
+      "grad_norm": 0.43220321793729743,
+      "learning_rate": 0.00011502378858074869,
+      "loss": 0.614,
+      "step": 1464
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.34989334390190063,
+      "learning_rate": 0.00011492130823645056,
+      "loss": 0.6317,
+      "step": 1465
+    },
+    {
+      "epoch": 0.46912,
+      "grad_norm": 0.34628728884582216,
+      "learning_rate": 0.00011481881186210765,
+      "loss": 0.6291,
+      "step": 1466
+    },
+    {
+      "epoch": 0.46944,
+      "grad_norm": 0.3294803821248172,
+      "learning_rate": 0.00011471629956783239,
+      "loss": 0.6026,
+      "step": 1467
+    },
+    {
+      "epoch": 0.46976,
+      "grad_norm": 0.35180452368672444,
+      "learning_rate": 0.0001146137714637543,
+      "loss": 0.6349,
+      "step": 1468
+    },
+    {
+      "epoch": 0.47008,
+      "grad_norm": 0.3855999411847994,
+      "learning_rate": 0.00011451122766001987,
+      "loss": 0.6258,
+      "step": 1469
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.343574916980757,
+      "learning_rate": 0.00011440866826679254,
+      "loss": 0.6301,
+      "step": 1470
+    },
+    {
+      "epoch": 0.47072,
+      "grad_norm": 0.3523512323973055,
+      "learning_rate": 0.0001143060933942524,
+      "loss": 0.6183,
+      "step": 1471
+    },
+    {
+      "epoch": 0.47104,
+      "grad_norm": 0.39126196332369406,
+      "learning_rate": 0.00011420350315259622,
+      "loss": 0.6002,
+      "step": 1472
+    },
+    {
+      "epoch": 0.47136,
+      "grad_norm": 0.38312920437104464,
+      "learning_rate": 0.00011410089765203724,
+      "loss": 0.6444,
+      "step": 1473
+    },
+    {
+      "epoch": 0.47168,
+      "grad_norm": 0.3406849317885865,
+      "learning_rate": 0.00011399827700280519,
+      "loss": 0.6166,
+      "step": 1474
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.35156970174795626,
+      "learning_rate": 0.00011389564131514596,
+      "loss": 0.648,
+      "step": 1475
+    },
+    {
+      "epoch": 0.47232,
+      "grad_norm": 0.3622574065950502,
+      "learning_rate": 0.00011379299069932165,
+      "loss": 0.5963,
+      "step": 1476
+    },
+    {
+      "epoch": 0.47264,
+      "grad_norm": 0.3479842152568595,
+      "learning_rate": 0.00011369032526561039,
+      "loss": 0.5882,
+      "step": 1477
+    },
+    {
+      "epoch": 0.47296,
+      "grad_norm": 0.3329170605694178,
+      "learning_rate": 0.00011358764512430622,
+      "loss": 0.5978,
+      "step": 1478
+    },
+    {
+      "epoch": 0.47328,
+      "grad_norm": 0.3861496322294955,
+      "learning_rate": 0.000113484950385719,
+      "loss": 0.6236,
+      "step": 1479
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3529911339843644,
+      "learning_rate": 0.00011338224116017423,
+      "loss": 0.6618,
+      "step": 1480
+    },
+    {
+      "epoch": 0.47392,
+      "grad_norm": 0.3392940403089316,
+      "learning_rate": 0.00011327951755801307,
+      "loss": 0.6002,
+      "step": 1481
+    },
+    {
+      "epoch": 0.47424,
+      "grad_norm": 0.34836953650491914,
+      "learning_rate": 0.000113176779689592,
+      "loss": 0.6337,
+      "step": 1482
+    },
+    {
+      "epoch": 0.47456,
+      "grad_norm": 0.34293545389884234,
+      "learning_rate": 0.00011307402766528293,
+      "loss": 0.6376,
+      "step": 1483
+    },
+    {
+      "epoch": 0.47488,
+      "grad_norm": 0.34734856793025476,
+      "learning_rate": 0.0001129712615954729,
+      "loss": 0.5877,
+      "step": 1484
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.35392265174991977,
+      "learning_rate": 0.00011286848159056409,
+      "loss": 0.6429,
+      "step": 1485
+    },
+    {
+      "epoch": 0.47552,
+      "grad_norm": 0.3586802859631239,
+      "learning_rate": 0.0001127656877609736,
+      "loss": 0.6371,
+      "step": 1486
+    },
+    {
+      "epoch": 0.47584,
+      "grad_norm": 0.3891543667036396,
+      "learning_rate": 0.00011266288021713347,
+      "loss": 0.6663,
+      "step": 1487
+    },
+    {
+      "epoch": 0.47616,
+      "grad_norm": 0.34439856456847573,
+      "learning_rate": 0.00011256005906949041,
+      "loss": 0.6127,
+      "step": 1488
+    },
+    {
+      "epoch": 0.47648,
+      "grad_norm": 0.3634753787510308,
+      "learning_rate": 0.0001124572244285057,
+      "loss": 0.6373,
+      "step": 1489
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3406557813425304,
+      "learning_rate": 0.00011235437640465522,
+      "loss": 0.6313,
+      "step": 1490
+    },
+    {
+      "epoch": 0.47712,
+      "grad_norm": 0.3512969725436879,
+      "learning_rate": 0.00011225151510842917,
+      "loss": 0.6322,
+      "step": 1491
+    },
+    {
+      "epoch": 0.47744,
+      "grad_norm": 0.35554341878234125,
+      "learning_rate": 0.00011214864065033202,
+      "loss": 0.6267,
+      "step": 1492
+    },
+    {
+      "epoch": 0.47776,
+      "grad_norm": 0.34908971553433915,
+      "learning_rate": 0.00011204575314088233,
+      "loss": 0.6171,
+      "step": 1493
+    },
+    {
+      "epoch": 0.47808,
+      "grad_norm": 0.4069273132712551,
+      "learning_rate": 0.00011194285269061277,
+      "loss": 0.6421,
+      "step": 1494
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.34053587415385045,
+      "learning_rate": 0.00011183993941006983,
+      "loss": 0.6132,
+      "step": 1495
+    },
+    {
+      "epoch": 0.47872,
+      "grad_norm": 0.34548132317073693,
+      "learning_rate": 0.00011173701340981386,
+      "loss": 0.6181,
+      "step": 1496
+    },
+    {
+      "epoch": 0.47904,
+      "grad_norm": 0.33489113586140046,
+      "learning_rate": 0.00011163407480041885,
+      "loss": 0.563,
+      "step": 1497
+    },
+    {
+      "epoch": 0.47936,
+      "grad_norm": 0.3530909412452826,
+      "learning_rate": 0.0001115311236924723,
+      "loss": 0.6428,
+      "step": 1498
+    },
+    {
+      "epoch": 0.47968,
+      "grad_norm": 0.34212937452029313,
+      "learning_rate": 0.00011142816019657514,
+      "loss": 0.6316,
+      "step": 1499
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3821487759207229,
+      "learning_rate": 0.00011132518442334161,
+      "loss": 0.626,
+      "step": 1500
+    },
+    {
+      "epoch": 0.48032,
+      "grad_norm": 0.3373358912775132,
+      "learning_rate": 0.00011122219648339924,
+      "loss": 0.6226,
+      "step": 1501
+    },
+    {
+      "epoch": 0.48064,
+      "grad_norm": 0.342414496716076,
+      "learning_rate": 0.00011111919648738851,
+      "loss": 0.6716,
+      "step": 1502
+    },
+    {
+      "epoch": 0.48096,
+      "grad_norm": 0.3999028830058103,
+      "learning_rate": 0.00011101618454596287,
+      "loss": 0.6415,
+      "step": 1503
+    },
+    {
+      "epoch": 0.48128,
+      "grad_norm": 0.3407819948407285,
+      "learning_rate": 0.00011091316076978866,
+      "loss": 0.6295,
+      "step": 1504
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.34683982320339674,
+      "learning_rate": 0.00011081012526954486,
+      "loss": 0.6432,
+      "step": 1505
+    },
+    {
+      "epoch": 0.48192,
+      "grad_norm": 0.3742143463070764,
+      "learning_rate": 0.0001107070781559231,
+      "loss": 0.6295,
+      "step": 1506
+    },
+    {
+      "epoch": 0.48224,
+      "grad_norm": 0.34628225365856147,
+      "learning_rate": 0.00011060401953962748,
+      "loss": 0.6281,
+      "step": 1507
+    },
+    {
+      "epoch": 0.48256,
+      "grad_norm": 0.3506858097180935,
+      "learning_rate": 0.00011050094953137444,
+      "loss": 0.6436,
+      "step": 1508
+    },
+    {
+      "epoch": 0.48288,
+      "grad_norm": 0.34512210750820227,
+      "learning_rate": 0.00011039786824189263,
+      "loss": 0.6678,
+      "step": 1509
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.34878405937210083,
+      "learning_rate": 0.00011029477578192291,
+      "loss": 0.6399,
+      "step": 1510
+    },
+    {
+      "epoch": 0.48352,
+      "grad_norm": 0.36579689331874526,
+      "learning_rate": 0.00011019167226221808,
+      "loss": 0.6664,
+      "step": 1511
+    },
+    {
+      "epoch": 0.48384,
+      "grad_norm": 0.36442956929914655,
+      "learning_rate": 0.00011008855779354281,
+      "loss": 0.6621,
+      "step": 1512
+    },
+    {
+      "epoch": 0.48416,
+      "grad_norm": 0.3490222252368435,
+      "learning_rate": 0.00010998543248667352,
+      "loss": 0.6165,
+      "step": 1513
+    },
+    {
+      "epoch": 0.48448,
+      "grad_norm": 0.3293433339691941,
+      "learning_rate": 0.00010988229645239836,
+      "loss": 0.6435,
+      "step": 1514
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3593454046175193,
+      "learning_rate": 0.00010977914980151691,
+      "loss": 0.6223,
+      "step": 1515
+    },
+    {
+      "epoch": 0.48512,
+      "grad_norm": 0.3714878582059504,
+      "learning_rate": 0.00010967599264484024,
+      "loss": 0.6899,
+      "step": 1516
+    },
+    {
+      "epoch": 0.48544,
+      "grad_norm": 0.3390951918633382,
+      "learning_rate": 0.00010957282509319056,
+      "loss": 0.5989,
+      "step": 1517
+    },
+    {
+      "epoch": 0.48576,
+      "grad_norm": 0.3650563201215915,
+      "learning_rate": 0.00010946964725740145,
+      "loss": 0.6377,
+      "step": 1518
+    },
+    {
+      "epoch": 0.48608,
+      "grad_norm": 0.3384077845207565,
+      "learning_rate": 0.0001093664592483174,
+      "loss": 0.6301,
+      "step": 1519
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3365703675980268,
+      "learning_rate": 0.00010926326117679388,
+      "loss": 0.6149,
+      "step": 1520
+    },
+    {
+      "epoch": 0.48672,
+      "grad_norm": 0.3509719189816956,
+      "learning_rate": 0.00010916005315369713,
+      "loss": 0.6384,
+      "step": 1521
+    },
+    {
+      "epoch": 0.48704,
+      "grad_norm": 0.34555108446779503,
+      "learning_rate": 0.00010905683528990406,
+      "loss": 0.5871,
+      "step": 1522
+    },
+    {
+      "epoch": 0.48736,
+      "grad_norm": 0.33944086739768,
+      "learning_rate": 0.0001089536076963023,
+      "loss": 0.614,
+      "step": 1523
+    },
+    {
+      "epoch": 0.48768,
+      "grad_norm": 0.3470535535643965,
+      "learning_rate": 0.00010885037048378977,
+      "loss": 0.6223,
+      "step": 1524
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.36763022084148006,
+      "learning_rate": 0.00010874712376327481,
+      "loss": 0.6508,
+      "step": 1525
+    },
+    {
+      "epoch": 0.48832,
+      "grad_norm": 0.35657647304209655,
+      "learning_rate": 0.00010864386764567588,
+      "loss": 0.6309,
+      "step": 1526
+    },
+    {
+      "epoch": 0.48864,
+      "grad_norm": 0.3591461764707076,
+      "learning_rate": 0.00010854060224192171,
+      "loss": 0.6358,
+      "step": 1527
+    },
+    {
+      "epoch": 0.48896,
+      "grad_norm": 0.48561248508741794,
+      "learning_rate": 0.0001084373276629508,
+      "loss": 0.6348,
+      "step": 1528
+    },
+    {
+      "epoch": 0.48928,
+      "grad_norm": 0.3937691425705274,
+      "learning_rate": 0.0001083340440197117,
+      "loss": 0.6763,
+      "step": 1529
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3694699208895203,
+      "learning_rate": 0.00010823075142316254,
+      "loss": 0.6427,
+      "step": 1530
+    },
+    {
+      "epoch": 0.48992,
+      "grad_norm": 0.3340047751988796,
+      "learning_rate": 0.00010812744998427113,
+      "loss": 0.687,
+      "step": 1531
+    },
+    {
+      "epoch": 0.49024,
+      "grad_norm": 0.3189005679326198,
+      "learning_rate": 0.00010802413981401483,
+      "loss": 0.5775,
+      "step": 1532
+    },
+    {
+      "epoch": 0.49056,
+      "grad_norm": 0.3325406920938598,
+      "learning_rate": 0.0001079208210233803,
+      "loss": 0.6136,
+      "step": 1533
+    },
+    {
+      "epoch": 0.49088,
+      "grad_norm": 0.3355965863247086,
+      "learning_rate": 0.00010781749372336352,
+      "loss": 0.6077,
+      "step": 1534
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.33799882207111426,
+      "learning_rate": 0.00010771415802496955,
+      "loss": 0.6723,
+      "step": 1535
+    },
+    {
+      "epoch": 0.49152,
+      "grad_norm": 0.3363724588202362,
+      "learning_rate": 0.00010761081403921254,
+      "loss": 0.6255,
+      "step": 1536
+    },
+    {
+      "epoch": 0.49184,
+      "grad_norm": 0.34712555670684897,
+      "learning_rate": 0.00010750746187711549,
+      "loss": 0.5984,
+      "step": 1537
+    },
+    {
+      "epoch": 0.49216,
+      "grad_norm": 0.35267736141035483,
+      "learning_rate": 0.00010740410164971019,
+      "loss": 0.654,
+      "step": 1538
+    },
+    {
+      "epoch": 0.49248,
+      "grad_norm": 0.342106239419953,
+      "learning_rate": 0.00010730073346803713,
+      "loss": 0.5809,
+      "step": 1539
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.35593989300415674,
+      "learning_rate": 0.00010719735744314534,
+      "loss": 0.6922,
+      "step": 1540
+    },
+    {
+      "epoch": 0.49312,
+      "grad_norm": 0.337651959706354,
+      "learning_rate": 0.00010709397368609227,
+      "loss": 0.636,
+      "step": 1541
+    },
+    {
+      "epoch": 0.49344,
+      "grad_norm": 0.33531385524069957,
+      "learning_rate": 0.00010699058230794361,
+      "loss": 0.6228,
+      "step": 1542
+    },
+    {
+      "epoch": 0.49376,
+      "grad_norm": 0.3859082536656671,
+      "learning_rate": 0.00010688718341977336,
+      "loss": 0.6683,
+      "step": 1543
+    },
+    {
+      "epoch": 0.49408,
+      "grad_norm": 0.3485561180895661,
+      "learning_rate": 0.00010678377713266345,
+      "loss": 0.5995,
+      "step": 1544
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.34372235738037266,
+      "learning_rate": 0.0001066803635577039,
+      "loss": 0.5981,
+      "step": 1545
+    },
+    {
+      "epoch": 0.49472,
+      "grad_norm": 0.33920912386132973,
+      "learning_rate": 0.00010657694280599249,
+      "loss": 0.6103,
+      "step": 1546
+    },
+    {
+      "epoch": 0.49504,
+      "grad_norm": 0.3414769670888569,
+      "learning_rate": 0.00010647351498863464,
+      "loss": 0.6016,
+      "step": 1547
+    },
+    {
+      "epoch": 0.49536,
+      "grad_norm": 0.3428845304428137,
+      "learning_rate": 0.00010637008021674351,
+      "loss": 0.6115,
+      "step": 1548
+    },
+    {
+      "epoch": 0.49568,
+      "grad_norm": 0.3444076882469626,
+      "learning_rate": 0.00010626663860143962,
+      "loss": 0.6308,
+      "step": 1549
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.35318799940328427,
+      "learning_rate": 0.00010616319025385089,
+      "loss": 0.6047,
+      "step": 1550
+    },
+    {
+      "epoch": 0.49632,
+      "grad_norm": 0.3552816630964503,
+      "learning_rate": 0.00010605973528511241,
+      "loss": 0.6381,
+      "step": 1551
+    },
+    {
+      "epoch": 0.49664,
+      "grad_norm": 0.3686555284800937,
+      "learning_rate": 0.0001059562738063665,
+      "loss": 0.6443,
+      "step": 1552
+    },
+    {
+      "epoch": 0.49696,
+      "grad_norm": 0.37288971430522566,
+      "learning_rate": 0.00010585280592876233,
+      "loss": 0.6661,
+      "step": 1553
+    },
+    {
+      "epoch": 0.49728,
+      "grad_norm": 0.34161361243175975,
+      "learning_rate": 0.00010574933176345607,
+      "loss": 0.6363,
+      "step": 1554
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.35552535199216256,
+      "learning_rate": 0.0001056458514216106,
+      "loss": 0.6575,
+      "step": 1555
+    },
+    {
+      "epoch": 0.49792,
+      "grad_norm": 0.3475335229702711,
+      "learning_rate": 0.0001055423650143954,
+      "loss": 0.6406,
+      "step": 1556
+    },
+    {
+      "epoch": 0.49824,
+      "grad_norm": 0.3574275794821226,
+      "learning_rate": 0.00010543887265298651,
+      "loss": 0.6583,
+      "step": 1557
+    },
+    {
+      "epoch": 0.49856,
+      "grad_norm": 0.3405013725914652,
+      "learning_rate": 0.00010533537444856636,
+      "loss": 0.5741,
+      "step": 1558
+    },
+    {
+      "epoch": 0.49888,
+      "grad_norm": 0.32599557351795333,
+      "learning_rate": 0.00010523187051232361,
+      "loss": 0.5653,
+      "step": 1559
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.35905268027963827,
+      "learning_rate": 0.00010512836095545318,
+      "loss": 0.6315,
+      "step": 1560
+    },
+    {
+      "epoch": 0.49952,
+      "grad_norm": 0.34767210200709264,
+      "learning_rate": 0.00010502484588915591,
+      "loss": 0.6434,
+      "step": 1561
+    },
+    {
+      "epoch": 0.49984,
+      "grad_norm": 0.34094109716070703,
+      "learning_rate": 0.00010492132542463866,
+      "loss": 0.6337,
+      "step": 1562
+    },
+    {
+      "epoch": 0.50016,
+      "grad_norm": 0.35053766408601883,
+      "learning_rate": 0.000104817799673114,
+      "loss": 0.6426,
+      "step": 1563
+    },
+    {
+      "epoch": 0.50048,
+      "grad_norm": 0.35446862941735086,
+      "learning_rate": 0.00010471426874580023,
+      "loss": 0.6024,
+      "step": 1564
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3514784473335216,
+      "learning_rate": 0.00010461073275392124,
+      "loss": 0.6033,
+      "step": 1565
+    },
+    {
+      "epoch": 0.50112,
+      "grad_norm": 0.3518396437748681,
+      "learning_rate": 0.00010450719180870625,
+      "loss": 0.6208,
+      "step": 1566
+    },
+    {
+      "epoch": 0.50144,
+      "grad_norm": 0.33915986199309217,
+      "learning_rate": 0.00010440364602138997,
+      "loss": 0.6363,
+      "step": 1567
+    },
+    {
+      "epoch": 0.50176,
+      "grad_norm": 0.32840816260361627,
+      "learning_rate": 0.00010430009550321216,
+      "loss": 0.6207,
+      "step": 1568
+    },
+    {
+      "epoch": 0.50208,
+      "grad_norm": 0.34851701116767964,
+      "learning_rate": 0.00010419654036541773,
+      "loss": 0.6527,
+      "step": 1569
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3526953113108774,
+      "learning_rate": 0.0001040929807192565,
+      "loss": 0.6297,
+      "step": 1570
+    },
+    {
+      "epoch": 0.50272,
+      "grad_norm": 0.3406735973567741,
+      "learning_rate": 0.00010398941667598328,
+      "loss": 0.6308,
+      "step": 1571
+    },
+    {
+      "epoch": 0.50304,
+      "grad_norm": 0.34907120340842374,
+      "learning_rate": 0.00010388584834685744,
+      "loss": 0.6219,
+      "step": 1572
+    },
+    {
+      "epoch": 0.50336,
+      "grad_norm": 0.34831451268479496,
+      "learning_rate": 0.000103782275843143,
+      "loss": 0.6516,
+      "step": 1573
+    },
+    {
+      "epoch": 0.50368,
+      "grad_norm": 0.33467870310641157,
+      "learning_rate": 0.00010367869927610849,
+      "loss": 0.618,
+      "step": 1574
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.34355951534504026,
+      "learning_rate": 0.0001035751187570268,
+      "loss": 0.6188,
+      "step": 1575
+    },
+    {
+      "epoch": 0.50432,
+      "grad_norm": 0.3456240834891537,
+      "learning_rate": 0.0001034715343971751,
+      "loss": 0.6338,
+      "step": 1576
+    },
+    {
+      "epoch": 0.50464,
+      "grad_norm": 0.33625577000451945,
+      "learning_rate": 0.00010336794630783457,
+      "loss": 0.648,
+      "step": 1577
+    },
+    {
+      "epoch": 0.50496,
+      "grad_norm": 0.37623334603388203,
+      "learning_rate": 0.00010326435460029052,
+      "loss": 0.6795,
+      "step": 1578
+    },
+    {
+      "epoch": 0.50528,
+      "grad_norm": 0.34629085630422224,
+      "learning_rate": 0.00010316075938583206,
+      "loss": 0.6097,
+      "step": 1579
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.34475625719241837,
+      "learning_rate": 0.00010305716077575215,
+      "loss": 0.6303,
+      "step": 1580
+    },
+    {
+      "epoch": 0.50592,
+      "grad_norm": 0.3480966711092534,
+      "learning_rate": 0.00010295355888134737,
+      "loss": 0.6121,
+      "step": 1581
+    },
+    {
+      "epoch": 0.50624,
+      "grad_norm": 0.32857632761879896,
+      "learning_rate": 0.00010284995381391776,
+      "loss": 0.579,
+      "step": 1582
+    },
+    {
+      "epoch": 0.50656,
+      "grad_norm": 0.3671680513839694,
+      "learning_rate": 0.00010274634568476687,
+      "loss": 0.6944,
+      "step": 1583
+    },
+    {
+      "epoch": 0.50688,
+      "grad_norm": 0.33882969595327056,
+      "learning_rate": 0.00010264273460520144,
+      "loss": 0.6094,
+      "step": 1584
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.34246469015874414,
+      "learning_rate": 0.00010253912068653146,
+      "loss": 0.6132,
+      "step": 1585
+    },
+    {
+      "epoch": 0.50752,
+      "grad_norm": 0.3430856300357972,
+      "learning_rate": 0.00010243550404006998,
+      "loss": 0.6277,
+      "step": 1586
+    },
+    {
+      "epoch": 0.50784,
+      "grad_norm": 0.3637497325407064,
+      "learning_rate": 0.00010233188477713289,
+      "loss": 0.5708,
+      "step": 1587
+    },
+    {
+      "epoch": 0.50816,
+      "grad_norm": 0.36489311092522864,
+      "learning_rate": 0.00010222826300903896,
+      "loss": 0.6436,
+      "step": 1588
+    },
+    {
+      "epoch": 0.50848,
+      "grad_norm": 0.347896629202345,
+      "learning_rate": 0.00010212463884710963,
+      "loss": 0.6708,
+      "step": 1589
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3987583220077781,
+      "learning_rate": 0.00010202101240266893,
+      "loss": 0.6293,
+      "step": 1590
+    },
+    {
+      "epoch": 0.50912,
+      "grad_norm": 0.34133588390452735,
+      "learning_rate": 0.00010191738378704332,
+      "loss": 0.6026,
+      "step": 1591
+    },
+    {
+      "epoch": 0.50944,
+      "grad_norm": 0.34308336912178355,
+      "learning_rate": 0.00010181375311156157,
+      "loss": 0.6212,
+      "step": 1592
+    },
+    {
+      "epoch": 0.50976,
+      "grad_norm": 0.3546976501552943,
+      "learning_rate": 0.00010171012048755472,
+      "loss": 0.6723,
+      "step": 1593
+    },
+    {
+      "epoch": 0.51008,
+      "grad_norm": 0.3452054980769207,
+      "learning_rate": 0.0001016064860263559,
+      "loss": 0.6097,
+      "step": 1594
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3634600412889473,
+      "learning_rate": 0.00010150284983930016,
+      "loss": 0.6639,
+      "step": 1595
+    },
+    {
+      "epoch": 0.51072,
+      "grad_norm": 0.3790210411846465,
+      "learning_rate": 0.00010139921203772446,
+      "loss": 0.6021,
+      "step": 1596
+    },
+    {
+      "epoch": 0.51104,
+      "grad_norm": 0.36305368367853424,
+      "learning_rate": 0.00010129557273296741,
+      "loss": 0.6586,
+      "step": 1597
+    },
+    {
+      "epoch": 0.51136,
+      "grad_norm": 0.34875358539597456,
+      "learning_rate": 0.00010119193203636939,
+      "loss": 0.6376,
+      "step": 1598
+    },
+    {
+      "epoch": 0.51168,
+      "grad_norm": 0.35160684463697656,
+      "learning_rate": 0.0001010882900592721,
+      "loss": 0.6172,
+      "step": 1599
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3510462536991429,
+      "learning_rate": 0.00010098464691301873,
+      "loss": 0.6245,
+      "step": 1600
+    },
+    {
+      "epoch": 0.51232,
+      "grad_norm": 0.3517323450812867,
+      "learning_rate": 0.00010088100270895364,
+      "loss": 0.6318,
+      "step": 1601
+    },
+    {
+      "epoch": 0.51264,
+      "grad_norm": 0.34741836930236275,
+      "learning_rate": 0.00010077735755842249,
+      "loss": 0.6483,
+      "step": 1602
+    },
+    {
+      "epoch": 0.51296,
+      "grad_norm": 0.3445827873634932,
+      "learning_rate": 0.00010067371157277172,
+      "loss": 0.6363,
+      "step": 1603
+    },
+    {
+      "epoch": 0.51328,
+      "grad_norm": 0.340521729146734,
+      "learning_rate": 0.00010057006486334886,
+      "loss": 0.6328,
+      "step": 1604
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3442250946520639,
+      "learning_rate": 0.00010046641754150214,
+      "loss": 0.5945,
+      "step": 1605
+    },
+    {
+      "epoch": 0.51392,
+      "grad_norm": 0.34561763070868484,
+      "learning_rate": 0.00010036276971858043,
+      "loss": 0.6182,
+      "step": 1606
+    },
+    {
+      "epoch": 0.51424,
+      "grad_norm": 0.6792569101809796,
+      "learning_rate": 0.0001002591215059332,
+      "loss": 0.6232,
+      "step": 1607
+    },
+    {
+      "epoch": 0.51456,
+      "grad_norm": 0.32707872563052687,
+      "learning_rate": 0.00010015547301491029,
+      "loss": 0.6305,
+      "step": 1608
+    },
+    {
+      "epoch": 0.51488,
+      "grad_norm": 0.3418782185323137,
+      "learning_rate": 0.00010005182435686185,
+      "loss": 0.6282,
+      "step": 1609
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3277883216323344,
+      "learning_rate": 9.994817564313819e-05,
+      "loss": 0.5916,
+      "step": 1610
+    },
+    {
+      "epoch": 0.51552,
+      "grad_norm": 0.3758405996209342,
+      "learning_rate": 9.984452698508976e-05,
+      "loss": 0.5882,
+      "step": 1611
+    },
+    {
+      "epoch": 0.51584,
+      "grad_norm": 0.35309424359566505,
+      "learning_rate": 9.974087849406683e-05,
+      "loss": 0.6298,
+      "step": 1612
+    },
+    {
+      "epoch": 0.51616,
+      "grad_norm": 0.35958186876854126,
+      "learning_rate": 9.963723028141958e-05,
+      "loss": 0.6351,
+      "step": 1613
+    },
+    {
+      "epoch": 0.51648,
+      "grad_norm": 0.35877317848063567,
+      "learning_rate": 9.953358245849791e-05,
+      "loss": 0.593,
+      "step": 1614
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3470855461805342,
+      "learning_rate": 9.942993513665115e-05,
+      "loss": 0.6275,
+      "step": 1615
+    },
+    {
+      "epoch": 0.51712,
+      "grad_norm": 0.35562411903011554,
+      "learning_rate": 9.932628842722833e-05,
+      "loss": 0.6568,
+      "step": 1616
+    },
+    {
+      "epoch": 0.51744,
+      "grad_norm": 0.3374542824126831,
+      "learning_rate": 9.922264244157755e-05,
+      "loss": 0.6127,
+      "step": 1617
+    },
+    {
+      "epoch": 0.51776,
+      "grad_norm": 0.4641883257339624,
+      "learning_rate": 9.911899729104636e-05,
+      "loss": 0.6304,
+      "step": 1618
+    },
+    {
+      "epoch": 0.51808,
+      "grad_norm": 0.36001288775377394,
+      "learning_rate": 9.901535308698134e-05,
+      "loss": 0.6493,
+      "step": 1619
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.34512170873528053,
+      "learning_rate": 9.891170994072793e-05,
+      "loss": 0.6653,
+      "step": 1620
+    },
+    {
+      "epoch": 0.51872,
+      "grad_norm": 0.34124183509899075,
+      "learning_rate": 9.880806796363062e-05,
+      "loss": 0.6237,
+      "step": 1621
+    },
+    {
+      "epoch": 0.51904,
+      "grad_norm": 0.35446878909065016,
+      "learning_rate": 9.870442726703261e-05,
+      "loss": 0.6246,
+      "step": 1622
+    },
+    {
+      "epoch": 0.51936,
+      "grad_norm": 0.3255396906990857,
+      "learning_rate": 9.860078796227556e-05,
+      "loss": 0.5546,
+      "step": 1623
+    },
+    {
+      "epoch": 0.51968,
+      "grad_norm": 0.3243820177044248,
+      "learning_rate": 9.849715016069986e-05,
+      "loss": 0.639,
+      "step": 1624
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3497987341346313,
+      "learning_rate": 9.839351397364411e-05,
+      "loss": 0.5908,
+      "step": 1625
+    },
+    {
+      "epoch": 0.52032,
+      "grad_norm": 0.3611728666287869,
+      "learning_rate": 9.828987951244528e-05,
+      "loss": 0.6432,
+      "step": 1626
+    },
+    {
+      "epoch": 0.52064,
+      "grad_norm": 0.3514356489710369,
+      "learning_rate": 9.818624688843846e-05,
+      "loss": 0.6276,
+      "step": 1627
+    },
+    {
+      "epoch": 0.52096,
+      "grad_norm": 0.34513254592759046,
+      "learning_rate": 9.808261621295672e-05,
+      "loss": 0.6102,
+      "step": 1628
+    },
+    {
+      "epoch": 0.52128,
+      "grad_norm": 0.33850586628037443,
+      "learning_rate": 9.79789875973311e-05,
+      "loss": 0.6,
+      "step": 1629
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3518783954718028,
+      "learning_rate": 9.787536115289038e-05,
+      "loss": 0.6132,
+      "step": 1630
+    },
+    {
+      "epoch": 0.52192,
+      "grad_norm": 0.3375530884733763,
+      "learning_rate": 9.777173699096107e-05,
+      "loss": 0.5779,
+      "step": 1631
+    },
+    {
+      "epoch": 0.52224,
+      "grad_norm": 0.3405406523913444,
+      "learning_rate": 9.766811522286712e-05,
+      "loss": 0.6513,
+      "step": 1632
+    },
+    {
+      "epoch": 0.52256,
+      "grad_norm": 0.3618874965518611,
+      "learning_rate": 9.756449595993004e-05,
+      "loss": 0.6352,
+      "step": 1633
+    },
+    {
+      "epoch": 0.52288,
+      "grad_norm": 0.36369634483609947,
+      "learning_rate": 9.746087931346852e-05,
+      "loss": 0.6321,
+      "step": 1634
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3338864944394306,
+      "learning_rate": 9.73572653947986e-05,
+      "loss": 0.5672,
+      "step": 1635
+    },
+    {
+      "epoch": 0.52352,
+      "grad_norm": 0.34625940166135283,
+      "learning_rate": 9.725365431523315e-05,
+      "loss": 0.6156,
+      "step": 1636
+    },
+    {
+      "epoch": 0.52384,
+      "grad_norm": 0.3340723236290614,
+      "learning_rate": 9.715004618608228e-05,
+      "loss": 0.6129,
+      "step": 1637
+    },
+    {
+      "epoch": 0.52416,
+      "grad_norm": 0.3402868286590741,
+      "learning_rate": 9.704644111865265e-05,
+      "loss": 0.6148,
+      "step": 1638
+    },
+    {
+      "epoch": 0.52448,
+      "grad_norm": 0.3459932283985027,
+      "learning_rate": 9.694283922424784e-05,
+      "loss": 0.6356,
+      "step": 1639
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.33908552214619375,
+      "learning_rate": 9.683924061416797e-05,
+      "loss": 0.6038,
+      "step": 1640
+    },
+    {
+      "epoch": 0.52512,
+      "grad_norm": 0.3572073149458451,
+      "learning_rate": 9.673564539970951e-05,
+      "loss": 0.6583,
+      "step": 1641
+    },
+    {
+      "epoch": 0.52544,
+      "grad_norm": 0.3643586303290633,
+      "learning_rate": 9.663205369216548e-05,
+      "loss": 0.6052,
+      "step": 1642
+    },
+    {
+      "epoch": 0.52576,
+      "grad_norm": 0.3388210109305512,
+      "learning_rate": 9.652846560282494e-05,
+      "loss": 0.5858,
+      "step": 1643
+    },
+    {
+      "epoch": 0.52608,
+      "grad_norm": 0.3550710451875338,
+      "learning_rate": 9.64248812429732e-05,
+      "loss": 0.6887,
+      "step": 1644
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.379058341960421,
+      "learning_rate": 9.632130072389152e-05,
+      "loss": 0.6832,
+      "step": 1645
+    },
+    {
+      "epoch": 0.52672,
+      "grad_norm": 0.33756503615840416,
+      "learning_rate": 9.621772415685703e-05,
+      "loss": 0.6563,
+      "step": 1646
+    },
+    {
+      "epoch": 0.52704,
+      "grad_norm": 0.3514272125488278,
+      "learning_rate": 9.61141516531426e-05,
+      "loss": 0.6819,
+      "step": 1647
+    },
+    {
+      "epoch": 0.52736,
+      "grad_norm": 0.3621922446497168,
+      "learning_rate": 9.601058332401673e-05,
+      "loss": 0.6365,
+      "step": 1648
+    },
+    {
+      "epoch": 0.52768,
+      "grad_norm": 0.3521641110158746,
+      "learning_rate": 9.590701928074348e-05,
+      "loss": 0.6375,
+      "step": 1649
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.34077196030814916,
+      "learning_rate": 9.580345963458233e-05,
+      "loss": 0.6038,
+      "step": 1650
+    },
+    {
+      "epoch": 0.52832,
+      "grad_norm": 0.3435477626400518,
+      "learning_rate": 9.569990449678787e-05,
+      "loss": 0.6136,
+      "step": 1651
+    },
+    {
+      "epoch": 0.52864,
+      "grad_norm": 0.3483526386704811,
+      "learning_rate": 9.559635397861004e-05,
+      "loss": 0.6346,
+      "step": 1652
+    },
+    {
+      "epoch": 0.52896,
+      "grad_norm": 0.3296478462200726,
+      "learning_rate": 9.549280819129377e-05,
+      "loss": 0.644,
+      "step": 1653
+    },
+    {
+      "epoch": 0.52928,
+      "grad_norm": 0.35505382681576264,
+      "learning_rate": 9.53892672460788e-05,
+      "loss": 0.6001,
+      "step": 1654
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.37060064973685763,
+      "learning_rate": 9.52857312541998e-05,
+      "loss": 0.6296,
+      "step": 1655
+    },
+    {
+      "epoch": 0.52992,
+      "grad_norm": 0.35545535495174535,
+      "learning_rate": 9.518220032688603e-05,
+      "loss": 0.6013,
+      "step": 1656
+    },
+    {
+      "epoch": 0.53024,
+      "grad_norm": 0.36337776428354174,
+      "learning_rate": 9.507867457536138e-05,
+      "loss": 0.6843,
+      "step": 1657
+    },
+    {
+      "epoch": 0.53056,
+      "grad_norm": 0.367860694950824,
+      "learning_rate": 9.49751541108441e-05,
+      "loss": 0.6435,
+      "step": 1658
+    },
+    {
+      "epoch": 0.53088,
+      "grad_norm": 0.39661411613344955,
+      "learning_rate": 9.487163904454685e-05,
+      "loss": 0.6509,
+      "step": 1659
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.34600133675729755,
+      "learning_rate": 9.47681294876764e-05,
+      "loss": 0.6312,
+      "step": 1660
+    },
+    {
+      "epoch": 0.53152,
+      "grad_norm": 0.3456184877652772,
+      "learning_rate": 9.466462555143368e-05,
+      "loss": 0.63,
+      "step": 1661
+    },
+    {
+      "epoch": 0.53184,
+      "grad_norm": 0.3420147724316903,
+      "learning_rate": 9.456112734701349e-05,
+      "loss": 0.6298,
+      "step": 1662
+    },
+    {
+      "epoch": 0.53216,
+      "grad_norm": 0.33553414127144215,
+      "learning_rate": 9.445763498560463e-05,
+      "loss": 0.5536,
+      "step": 1663
+    },
+    {
+      "epoch": 0.53248,
+      "grad_norm": 0.33471678110632297,
+      "learning_rate": 9.435414857838942e-05,
+      "loss": 0.6177,
+      "step": 1664
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.34285722592754014,
+      "learning_rate": 9.425066823654393e-05,
+      "loss": 0.6187,
+      "step": 1665
+    },
+    {
+      "epoch": 0.53312,
+      "grad_norm": 0.336502798863346,
+      "learning_rate": 9.41471940712377e-05,
+      "loss": 0.631,
+      "step": 1666
+    },
+    {
+      "epoch": 0.53344,
+      "grad_norm": 0.3672435941706159,
+      "learning_rate": 9.404372619363353e-05,
+      "loss": 0.6211,
+      "step": 1667
+    },
+    {
+      "epoch": 0.53376,
+      "grad_norm": 0.3290893712675334,
+      "learning_rate": 9.394026471488762e-05,
+      "loss": 0.5746,
+      "step": 1668
+    },
+    {
+      "epoch": 0.53408,
+      "grad_norm": 0.3558363473580683,
+      "learning_rate": 9.383680974614915e-05,
+      "loss": 0.6478,
+      "step": 1669
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.33461305565651456,
+      "learning_rate": 9.373336139856039e-05,
+      "loss": 0.6204,
+      "step": 1670
+    },
+    {
+      "epoch": 0.53472,
+      "grad_norm": 0.3572442378059015,
+      "learning_rate": 9.36299197832565e-05,
+      "loss": 0.6767,
+      "step": 1671
+    },
+    {
+      "epoch": 0.53504,
+      "grad_norm": 0.34831788962831867,
+      "learning_rate": 9.352648501136538e-05,
+      "loss": 0.6407,
+      "step": 1672
+    },
+    {
+      "epoch": 0.53536,
+      "grad_norm": 0.3466679800660956,
+      "learning_rate": 9.342305719400755e-05,
+      "loss": 0.5952,
+      "step": 1673
+    },
+    {
+      "epoch": 0.53568,
+      "grad_norm": 0.36002830709514616,
+      "learning_rate": 9.331963644229611e-05,
+      "loss": 0.6099,
+      "step": 1674
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3383518386169015,
+      "learning_rate": 9.321622286733655e-05,
+      "loss": 0.5839,
+      "step": 1675
+    },
+    {
+      "epoch": 0.53632,
+      "grad_norm": 0.3436071802430533,
+      "learning_rate": 9.31128165802267e-05,
+      "loss": 0.5881,
+      "step": 1676
+    },
+    {
+      "epoch": 0.53664,
+      "grad_norm": 0.3385875097113795,
+      "learning_rate": 9.30094176920564e-05,
+      "loss": 0.6477,
+      "step": 1677
+    },
+    {
+      "epoch": 0.53696,
+      "grad_norm": 0.34485333697226567,
+      "learning_rate": 9.290602631390774e-05,
+      "loss": 0.615,
+      "step": 1678
+    },
+    {
+      "epoch": 0.53728,
+      "grad_norm": 0.34265052061935997,
+      "learning_rate": 9.280264255685467e-05,
+      "loss": 0.642,
+      "step": 1679
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.340332040773954,
+      "learning_rate": 9.269926653196286e-05,
+      "loss": 0.6265,
+      "step": 1680
+    },
+    {
+      "epoch": 0.53792,
+      "grad_norm": 0.35983120342480945,
+      "learning_rate": 9.259589835028985e-05,
+      "loss": 0.5691,
+      "step": 1681
+    },
+    {
+      "epoch": 0.53824,
+      "grad_norm": 0.3613563884457701,
+      "learning_rate": 9.249253812288454e-05,
+      "loss": 0.6326,
+      "step": 1682
+    },
+    {
+      "epoch": 0.53856,
+      "grad_norm": 0.3347482561034258,
+      "learning_rate": 9.238918596078746e-05,
+      "loss": 0.6058,
+      "step": 1683
+    },
+    {
+      "epoch": 0.53888,
+      "grad_norm": 0.33377818378359564,
+      "learning_rate": 9.228584197503047e-05,
+      "loss": 0.6075,
+      "step": 1684
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.34869157014223023,
+      "learning_rate": 9.21825062766365e-05,
+      "loss": 0.6221,
+      "step": 1685
+    },
+    {
+      "epoch": 0.53952,
+      "grad_norm": 0.3550502704944002,
+      "learning_rate": 9.207917897661971e-05,
+      "loss": 0.5982,
+      "step": 1686
+    },
+    {
+      "epoch": 0.53984,
+      "grad_norm": 0.3642034198511041,
+      "learning_rate": 9.197586018598518e-05,
+      "loss": 0.6285,
+      "step": 1687
+    },
+    {
+      "epoch": 0.54016,
+      "grad_norm": 0.330179119689698,
+      "learning_rate": 9.187255001572886e-05,
+      "loss": 0.5946,
+      "step": 1688
+    },
+    {
+      "epoch": 0.54048,
+      "grad_norm": 0.3545109241322245,
+      "learning_rate": 9.17692485768375e-05,
+      "loss": 0.6497,
+      "step": 1689
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.33016124052617013,
+      "learning_rate": 9.166595598028832e-05,
+      "loss": 0.6192,
+      "step": 1690
+    },
+    {
+      "epoch": 0.54112,
+      "grad_norm": 0.3260067961830586,
+      "learning_rate": 9.156267233704922e-05,
+      "loss": 0.5986,
+      "step": 1691
+    },
+    {
+      "epoch": 0.54144,
+      "grad_norm": 0.3478611469874791,
+      "learning_rate": 9.145939775807833e-05,
+      "loss": 0.615,
+      "step": 1692
+    },
+    {
+      "epoch": 0.54176,
+      "grad_norm": 0.34285832246410863,
+      "learning_rate": 9.135613235432413e-05,
+      "loss": 0.6247,
+      "step": 1693
+    },
+    {
+      "epoch": 0.54208,
+      "grad_norm": 0.3657478913658266,
+      "learning_rate": 9.125287623672525e-05,
+      "loss": 0.6072,
+      "step": 1694
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3476063432549975,
+      "learning_rate": 9.114962951621024e-05,
+      "loss": 0.6605,
+      "step": 1695
+    },
+    {
+      "epoch": 0.54272,
+      "grad_norm": 0.34453039917363415,
+      "learning_rate": 9.104639230369769e-05,
+      "loss": 0.6025,
+      "step": 1696
+    },
+    {
+      "epoch": 0.54304,
+      "grad_norm": 0.3425646347782955,
+      "learning_rate": 9.094316471009596e-05,
+      "loss": 0.6119,
+      "step": 1697
+    },
+    {
+      "epoch": 0.54336,
+      "grad_norm": 0.32810171938466276,
+      "learning_rate": 9.083994684630289e-05,
+      "loss": 0.6159,
+      "step": 1698
+    },
+    {
+      "epoch": 0.54368,
+      "grad_norm": 0.33519412706874185,
+      "learning_rate": 9.073673882320615e-05,
+      "loss": 0.5634,
+      "step": 1699
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.36123679999363023,
+      "learning_rate": 9.063354075168262e-05,
+      "loss": 0.6588,
+      "step": 1700
+    },
+    {
+      "epoch": 0.54432,
+      "grad_norm": 0.3389970013734657,
+      "learning_rate": 9.053035274259855e-05,
+      "loss": 0.5957,
+      "step": 1701
+    },
+    {
+      "epoch": 0.54464,
+      "grad_norm": 0.3674292130209685,
+      "learning_rate": 9.042717490680946e-05,
+      "loss": 0.6384,
+      "step": 1702
+    },
+    {
+      "epoch": 0.54496,
+      "grad_norm": 0.36273330396477954,
+      "learning_rate": 9.03240073551598e-05,
+      "loss": 0.6003,
+      "step": 1703
+    },
+    {
+      "epoch": 0.54528,
+      "grad_norm": 0.3384260519568992,
+      "learning_rate": 9.022085019848314e-05,
+      "loss": 0.6247,
+      "step": 1704
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3455683340536113,
+      "learning_rate": 9.011770354760168e-05,
+      "loss": 0.6395,
+      "step": 1705
+    },
+    {
+      "epoch": 0.54592,
+      "grad_norm": 0.34367493334249694,
+      "learning_rate": 9.001456751332649e-05,
+      "loss": 0.6621,
+      "step": 1706
+    },
+    {
+      "epoch": 0.54624,
+      "grad_norm": 0.35644784070146923,
+      "learning_rate": 8.991144220645724e-05,
+      "loss": 0.6655,
+      "step": 1707
+    },
+    {
+      "epoch": 0.54656,
+      "grad_norm": 0.3534638713268805,
+      "learning_rate": 8.980832773778193e-05,
+      "loss": 0.6318,
+      "step": 1708
+    },
+    {
+      "epoch": 0.54688,
+      "grad_norm": 0.3409241916958398,
+      "learning_rate": 8.970522421807707e-05,
+      "loss": 0.609,
+      "step": 1709
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.35038036509249415,
+      "learning_rate": 8.960213175810738e-05,
+      "loss": 0.6631,
+      "step": 1710
+    },
+    {
+      "epoch": 0.54752,
+      "grad_norm": 0.35825908020786396,
+      "learning_rate": 8.94990504686256e-05,
+      "loss": 0.661,
+      "step": 1711
+    },
+    {
+      "epoch": 0.54784,
+      "grad_norm": 0.34664905650836964,
+      "learning_rate": 8.939598046037257e-05,
+      "loss": 0.6769,
+      "step": 1712
+    },
+    {
+      "epoch": 0.54816,
+      "grad_norm": 0.35853567678098225,
+      "learning_rate": 8.929292184407692e-05,
+      "loss": 0.6495,
+      "step": 1713
+    },
+    {
+      "epoch": 0.54848,
+      "grad_norm": 0.3174661452196107,
+      "learning_rate": 8.918987473045517e-05,
+      "loss": 0.6152,
+      "step": 1714
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.3495590900347859,
+      "learning_rate": 8.908683923021137e-05,
+      "loss": 0.637,
+      "step": 1715
+    },
+    {
+      "epoch": 0.54912,
+      "grad_norm": 0.33932804328146104,
+      "learning_rate": 8.898381545403714e-05,
+      "loss": 0.5833,
+      "step": 1716
+    },
+    {
+      "epoch": 0.54944,
+      "grad_norm": 0.38013174749029327,
+      "learning_rate": 8.888080351261154e-05,
+      "loss": 0.623,
+      "step": 1717
+    },
+    {
+      "epoch": 0.54976,
+      "grad_norm": 0.3633381849551441,
+      "learning_rate": 8.877780351660078e-05,
+      "loss": 0.6319,
+      "step": 1718
+    },
+    {
+      "epoch": 0.55008,
+      "grad_norm": 0.34351967998456356,
+      "learning_rate": 8.867481557665837e-05,
+      "loss": 0.5893,
+      "step": 1719
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.35912950579023756,
+      "learning_rate": 8.857183980342491e-05,
+      "loss": 0.5895,
+      "step": 1720
+    },
+    {
+      "epoch": 0.55072,
+      "grad_norm": 0.3599373228414958,
+      "learning_rate": 8.846887630752774e-05,
+      "loss": 0.5929,
+      "step": 1721
+    },
+    {
+      "epoch": 0.55104,
+      "grad_norm": 0.36908549605691704,
+      "learning_rate": 8.836592519958118e-05,
+      "loss": 0.6235,
+      "step": 1722
+    },
+    {
+      "epoch": 0.55136,
+      "grad_norm": 0.3565050343212028,
+      "learning_rate": 8.826298659018615e-05,
+      "loss": 0.6202,
+      "step": 1723
+    },
+    {
+      "epoch": 0.55168,
+      "grad_norm": 0.33184316237640576,
+      "learning_rate": 8.816006058993018e-05,
+      "loss": 0.6089,
+      "step": 1724
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3480609683269693,
+      "learning_rate": 8.805714730938728e-05,
+      "loss": 0.597,
+      "step": 1725
+    },
+    {
+      "epoch": 0.55232,
+      "grad_norm": 0.32641504693056705,
+      "learning_rate": 8.795424685911769e-05,
+      "loss": 0.6014,
+      "step": 1726
+    },
+    {
+      "epoch": 0.55264,
+      "grad_norm": 0.3370306759354407,
+      "learning_rate": 8.785135934966802e-05,
+      "loss": 0.5981,
+      "step": 1727
+    },
+    {
+      "epoch": 0.55296,
+      "grad_norm": 0.3524600351484558,
+      "learning_rate": 8.774848489157085e-05,
+      "loss": 0.6521,
+      "step": 1728
+    },
+    {
+      "epoch": 0.55328,
+      "grad_norm": 0.34147473912059895,
+      "learning_rate": 8.76456235953448e-05,
+      "loss": 0.5806,
+      "step": 1729
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.34229875747408445,
+      "learning_rate": 8.754277557149431e-05,
+      "loss": 0.6229,
+      "step": 1730
+    },
+    {
+      "epoch": 0.55392,
+      "grad_norm": 0.3556211593332454,
+      "learning_rate": 8.743994093050963e-05,
+      "loss": 0.6357,
+      "step": 1731
+    },
+    {
+      "epoch": 0.55424,
+      "grad_norm": 0.3459976872304538,
+      "learning_rate": 8.733711978286652e-05,
+      "loss": 0.606,
+      "step": 1732
+    },
+    {
+      "epoch": 0.55456,
+      "grad_norm": 0.32759608492010256,
+      "learning_rate": 8.723431223902642e-05,
+      "loss": 0.5674,
+      "step": 1733
+    },
+    {
+      "epoch": 0.55488,
+      "grad_norm": 0.3591738982655165,
+      "learning_rate": 8.713151840943593e-05,
+      "loss": 0.6368,
+      "step": 1734
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3548210898890839,
+      "learning_rate": 8.702873840452715e-05,
+      "loss": 0.6256,
+      "step": 1735
+    },
+    {
+      "epoch": 0.55552,
+      "grad_norm": 0.3420524668970695,
+      "learning_rate": 8.69259723347171e-05,
+      "loss": 0.5913,
+      "step": 1736
+    },
+    {
+      "epoch": 0.55584,
+      "grad_norm": 0.3677436150713398,
+      "learning_rate": 8.6823220310408e-05,
+      "loss": 0.6019,
+      "step": 1737
+    },
+    {
+      "epoch": 0.55616,
+      "grad_norm": 0.758768611701704,
+      "learning_rate": 8.672048244198696e-05,
+      "loss": 0.6071,
+      "step": 1738
+    },
+    {
+      "epoch": 0.55648,
+      "grad_norm": 0.4664334827903925,
+      "learning_rate": 8.661775883982578e-05,
+      "loss": 0.5909,
+      "step": 1739
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3512120492042207,
+      "learning_rate": 8.651504961428103e-05,
+      "loss": 0.5943,
+      "step": 1740
+    },
+    {
+      "epoch": 0.55712,
+      "grad_norm": 0.3494784468671295,
+      "learning_rate": 8.641235487569381e-05,
+      "loss": 0.6019,
+      "step": 1741
+    },
+    {
+      "epoch": 0.55744,
+      "grad_norm": 0.40159765671756575,
+      "learning_rate": 8.630967473438965e-05,
+      "loss": 0.6165,
+      "step": 1742
+    },
+    {
+      "epoch": 0.55776,
+      "grad_norm": 0.3516738509161463,
+      "learning_rate": 8.620700930067837e-05,
+      "loss": 0.6166,
+      "step": 1743
+    },
+    {
+      "epoch": 0.55808,
+      "grad_norm": 0.3453203645406318,
+      "learning_rate": 8.610435868485406e-05,
+      "loss": 0.6038,
+      "step": 1744
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3478563315262649,
+      "learning_rate": 8.60017229971948e-05,
+      "loss": 0.6395,
+      "step": 1745
+    },
+    {
+      "epoch": 0.55872,
+      "grad_norm": 0.34843144616470656,
+      "learning_rate": 8.589910234796277e-05,
+      "loss": 0.6315,
+      "step": 1746
+    },
+    {
+      "epoch": 0.55904,
+      "grad_norm": 0.3614937658969523,
+      "learning_rate": 8.57964968474038e-05,
+      "loss": 0.6416,
+      "step": 1747
+    },
+    {
+      "epoch": 0.55936,
+      "grad_norm": 0.3514392345357859,
+      "learning_rate": 8.569390660574764e-05,
+      "loss": 0.5702,
+      "step": 1748
+    },
+    {
+      "epoch": 0.55968,
+      "grad_norm": 0.3775582118729312,
+      "learning_rate": 8.559133173320749e-05,
+      "loss": 0.6333,
+      "step": 1749
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.34841874680400164,
+      "learning_rate": 8.548877233998014e-05,
+      "loss": 0.6002,
+      "step": 1750
+    },
+    {
+      "epoch": 0.56032,
+      "grad_norm": 0.36064135754041476,
+      "learning_rate": 8.538622853624575e-05,
+      "loss": 0.6102,
+      "step": 1751
+    },
+    {
+      "epoch": 0.56064,
+      "grad_norm": 0.3498893351562969,
+      "learning_rate": 8.528370043216763e-05,
+      "loss": 0.6583,
+      "step": 1752
+    },
+    {
+      "epoch": 0.56096,
+      "grad_norm": 0.33541127767302875,
+      "learning_rate": 8.518118813789237e-05,
+      "loss": 0.6345,
+      "step": 1753
+    },
+    {
+      "epoch": 0.56128,
+      "grad_norm": 0.3444274881783159,
+      "learning_rate": 8.507869176354945e-05,
+      "loss": 0.5759,
+      "step": 1754
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.359571877500463,
+      "learning_rate": 8.497621141925134e-05,
+      "loss": 0.6181,
+      "step": 1755
+    },
+    {
+      "epoch": 0.56192,
+      "grad_norm": 0.3364808308963259,
+      "learning_rate": 8.48737472150932e-05,
+      "loss": 0.6301,
+      "step": 1756
+    },
+    {
+      "epoch": 0.56224,
+      "grad_norm": 0.35686018816248494,
+      "learning_rate": 8.477129926115292e-05,
+      "loss": 0.5969,
+      "step": 1757
+    },
+    {
+      "epoch": 0.56256,
+      "grad_norm": 0.3768640326394994,
+      "learning_rate": 8.46688676674909e-05,
+      "loss": 0.6699,
+      "step": 1758
+    },
+    {
+      "epoch": 0.56288,
+      "grad_norm": 0.3335654500656375,
+      "learning_rate": 8.456645254414998e-05,
+      "loss": 0.5696,
+      "step": 1759
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3311188794008257,
+      "learning_rate": 8.44640540011553e-05,
+      "loss": 0.6219,
+      "step": 1760
+    },
+    {
+      "epoch": 0.56352,
+      "grad_norm": 0.3687414880614699,
+      "learning_rate": 8.43616721485142e-05,
+      "loss": 0.6129,
+      "step": 1761
+    },
+    {
+      "epoch": 0.56384,
+      "grad_norm": 0.3605932703963821,
+      "learning_rate": 8.425930709621603e-05,
+      "loss": 0.6274,
+      "step": 1762
+    },
+    {
+      "epoch": 0.56416,
+      "grad_norm": 0.36229678089782624,
+      "learning_rate": 8.415695895423217e-05,
+      "loss": 0.6263,
+      "step": 1763
+    },
+    {
+      "epoch": 0.56448,
+      "grad_norm": 0.36506050857138017,
+      "learning_rate": 8.405462783251584e-05,
+      "loss": 0.6862,
+      "step": 1764
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.34292050339086494,
+      "learning_rate": 8.395231384100186e-05,
+      "loss": 0.6257,
+      "step": 1765
+    },
+    {
+      "epoch": 0.56512,
+      "grad_norm": 0.3646566164278792,
+      "learning_rate": 8.38500170896068e-05,
+      "loss": 0.634,
+      "step": 1766
+    },
+    {
+      "epoch": 0.56544,
+      "grad_norm": 0.3189982683044711,
+      "learning_rate": 8.374773768822852e-05,
+      "loss": 0.6274,
+      "step": 1767
+    },
+    {
+      "epoch": 0.56576,
+      "grad_norm": 0.3414819172629874,
+      "learning_rate": 8.364547574674646e-05,
+      "loss": 0.6319,
+      "step": 1768
+    },
+    {
+      "epoch": 0.56608,
+      "grad_norm": 0.35254524861186665,
+      "learning_rate": 8.354323137502116e-05,
+      "loss": 0.5951,
+      "step": 1769
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.33019863111545944,
+      "learning_rate": 8.344100468289432e-05,
+      "loss": 0.5548,
+      "step": 1770
+    },
+    {
+      "epoch": 0.56672,
+      "grad_norm": 0.36060299596327494,
+      "learning_rate": 8.33387957801886e-05,
+      "loss": 0.6434,
+      "step": 1771
+    },
+    {
+      "epoch": 0.56704,
+      "grad_norm": 0.32756797011689986,
+      "learning_rate": 8.32366047767077e-05,
+      "loss": 0.5958,
+      "step": 1772
+    },
+    {
+      "epoch": 0.56736,
+      "grad_norm": 0.33393303877597424,
+      "learning_rate": 8.313443178223588e-05,
+      "loss": 0.5528,
+      "step": 1773
+    },
+    {
+      "epoch": 0.56768,
+      "grad_norm": 0.3182382517355006,
+      "learning_rate": 8.303227690653823e-05,
+      "loss": 0.5875,
+      "step": 1774
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.34022211187839557,
+      "learning_rate": 8.293014025936025e-05,
+      "loss": 0.5936,
+      "step": 1775
+    },
+    {
+      "epoch": 0.56832,
+      "grad_norm": 0.35043308249476285,
+      "learning_rate": 8.282802195042791e-05,
+      "loss": 0.6393,
+      "step": 1776
+    },
+    {
+      "epoch": 0.56864,
+      "grad_norm": 0.33363217945429957,
+      "learning_rate": 8.272592208944757e-05,
+      "loss": 0.5875,
+      "step": 1777
+    },
+    {
+      "epoch": 0.56896,
+      "grad_norm": 0.3669529817187083,
+      "learning_rate": 8.262384078610557e-05,
+      "loss": 0.6068,
+      "step": 1778
+    },
+    {
+      "epoch": 0.56928,
+      "grad_norm": 0.33683705702974326,
+      "learning_rate": 8.25217781500685e-05,
+      "loss": 0.5844,
+      "step": 1779
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.33157252838040185,
+      "learning_rate": 8.241973429098278e-05,
+      "loss": 0.6147,
+      "step": 1780
+    },
+    {
+      "epoch": 0.56992,
+      "grad_norm": 0.3601380705008049,
+      "learning_rate": 8.231770931847468e-05,
+      "loss": 0.6203,
+      "step": 1781
+    },
+    {
+      "epoch": 0.57024,
+      "grad_norm": 0.3416796462478041,
+      "learning_rate": 8.221570334215028e-05,
+      "loss": 0.5912,
+      "step": 1782
+    },
+    {
+      "epoch": 0.57056,
+      "grad_norm": 0.3566078336017613,
+      "learning_rate": 8.211371647159508e-05,
+      "loss": 0.6134,
+      "step": 1783
+    },
+    {
+      "epoch": 0.57088,
+      "grad_norm": 0.34778276936001096,
+      "learning_rate": 8.201174881637418e-05,
+      "loss": 0.5982,
+      "step": 1784
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3450000549538375,
+      "learning_rate": 8.190980048603202e-05,
+      "loss": 0.6235,
+      "step": 1785
+    },
+    {
+      "epoch": 0.57152,
+      "grad_norm": 0.31995352297599994,
+      "learning_rate": 8.180787159009224e-05,
+      "loss": 0.5897,
+      "step": 1786
+    },
+    {
+      "epoch": 0.57184,
+      "grad_norm": 0.3476253266014655,
+      "learning_rate": 8.170596223805764e-05,
+      "loss": 0.6175,
+      "step": 1787
+    },
+    {
+      "epoch": 0.57216,
+      "grad_norm": 0.35229632600308786,
+      "learning_rate": 8.160407253940996e-05,
+      "loss": 0.602,
+      "step": 1788
+    },
+    {
+      "epoch": 0.57248,
+      "grad_norm": 0.344557938524413,
+      "learning_rate": 8.15022026036099e-05,
+      "loss": 0.5972,
+      "step": 1789
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3446825091651972,
+      "learning_rate": 8.140035254009694e-05,
+      "loss": 0.5881,
+      "step": 1790
+    },
+    {
+      "epoch": 0.57312,
+      "grad_norm": 0.35506574119094275,
+      "learning_rate": 8.129852245828911e-05,
+      "loss": 0.6082,
+      "step": 1791
+    },
+    {
+      "epoch": 0.57344,
+      "grad_norm": 0.33758397072840335,
+      "learning_rate": 8.119671246758309e-05,
+      "loss": 0.5517,
+      "step": 1792
+    },
+    {
+      "epoch": 0.57376,
+      "grad_norm": 0.3592248986561124,
+      "learning_rate": 8.109492267735385e-05,
+      "loss": 0.6164,
+      "step": 1793
+    },
+    {
+      "epoch": 0.57408,
+      "grad_norm": 0.4102568831719911,
+      "learning_rate": 8.09931531969548e-05,
+      "loss": 0.652,
+      "step": 1794
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.346546650174362,
+      "learning_rate": 8.089140413571747e-05,
+      "loss": 0.6199,
+      "step": 1795
+    },
+    {
+      "epoch": 0.57472,
+      "grad_norm": 0.3417509959458073,
+      "learning_rate": 8.078967560295135e-05,
+      "loss": 0.6402,
+      "step": 1796
+    },
+    {
+      "epoch": 0.57504,
+      "grad_norm": 0.3593332033690645,
+      "learning_rate": 8.068796770794409e-05,
+      "loss": 0.6201,
+      "step": 1797
+    },
+    {
+      "epoch": 0.57536,
+      "grad_norm": 0.3344397831757427,
+      "learning_rate": 8.058628055996097e-05,
+      "loss": 0.5854,
+      "step": 1798
+    },
+    {
+      "epoch": 0.57568,
+      "grad_norm": 0.33669693105473986,
+      "learning_rate": 8.048461426824504e-05,
+      "loss": 0.6071,
+      "step": 1799
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3322098668524984,
+      "learning_rate": 8.038296894201709e-05,
+      "loss": 0.6007,
+      "step": 1800
+    },
+    {
+      "epoch": 0.57632,
+      "grad_norm": 0.36398466596258644,
+      "learning_rate": 8.028134469047511e-05,
+      "loss": 0.6501,
+      "step": 1801
+    },
+    {
+      "epoch": 0.57664,
+      "grad_norm": 0.3817692776522116,
+      "learning_rate": 8.017974162279468e-05,
+      "loss": 0.622,
+      "step": 1802
+    },
+    {
+      "epoch": 0.57696,
+      "grad_norm": 0.3318927429557493,
+      "learning_rate": 8.007815984812858e-05,
+      "loss": 0.613,
+      "step": 1803
+    },
+    {
+      "epoch": 0.57728,
+      "grad_norm": 0.3355035577313009,
+      "learning_rate": 7.997659947560657e-05,
+      "loss": 0.5825,
+      "step": 1804
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.35134403475325027,
+      "learning_rate": 7.987506061433566e-05,
+      "loss": 0.6403,
+      "step": 1805
+    },
+    {
+      "epoch": 0.57792,
+      "grad_norm": 0.34429640725696375,
+      "learning_rate": 7.977354337339947e-05,
+      "loss": 0.6203,
+      "step": 1806
+    },
+    {
+      "epoch": 0.57824,
+      "grad_norm": 0.32812620362219835,
+      "learning_rate": 7.967204786185862e-05,
+      "loss": 0.5783,
+      "step": 1807
+    },
+    {
+      "epoch": 0.57856,
+      "grad_norm": 0.3632573766266116,
+      "learning_rate": 7.957057418875035e-05,
+      "loss": 0.6556,
+      "step": 1808
+    },
+    {
+      "epoch": 0.57888,
+      "grad_norm": 0.3581287231356658,
+      "learning_rate": 7.94691224630883e-05,
+      "loss": 0.574,
+      "step": 1809
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3513606175773462,
+      "learning_rate": 7.93676927938627e-05,
+      "loss": 0.5968,
+      "step": 1810
+    },
+    {
+      "epoch": 0.57952,
+      "grad_norm": 0.3537518775102126,
+      "learning_rate": 7.926628529003993e-05,
+      "loss": 0.6246,
+      "step": 1811
+    },
+    {
+      "epoch": 0.57984,
+      "grad_norm": 0.32630686990256164,
+      "learning_rate": 7.916490006056272e-05,
+      "loss": 0.5991,
+      "step": 1812
+    },
+    {
+      "epoch": 0.58016,
+      "grad_norm": 0.3368490146509299,
+      "learning_rate": 7.906353721434976e-05,
+      "loss": 0.5983,
+      "step": 1813
+    },
+    {
+      "epoch": 0.58048,
+      "grad_norm": 0.3301949361054577,
+      "learning_rate": 7.896219686029568e-05,
+      "loss": 0.6067,
+      "step": 1814
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.3290942403417462,
+      "learning_rate": 7.886087910727102e-05,
+      "loss": 0.5935,
+      "step": 1815
+    },
+    {
+      "epoch": 0.58112,
+      "grad_norm": 0.3768347447878006,
+      "learning_rate": 7.875958406412205e-05,
+      "loss": 0.5981,
+      "step": 1816
+    },
+    {
+      "epoch": 0.58144,
+      "grad_norm": 0.35085799962451825,
+      "learning_rate": 7.865831183967052e-05,
+      "loss": 0.57,
+      "step": 1817
+    },
+    {
+      "epoch": 0.58176,
+      "grad_norm": 0.3362152555262188,
+      "learning_rate": 7.855706254271383e-05,
+      "loss": 0.5961,
+      "step": 1818
+    },
+    {
+      "epoch": 0.58208,
+      "grad_norm": 0.3382900287554013,
+      "learning_rate": 7.845583628202458e-05,
+      "loss": 0.562,
+      "step": 1819
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3640850708981999,
+      "learning_rate": 7.835463316635076e-05,
+      "loss": 0.6246,
+      "step": 1820
+    },
+    {
+      "epoch": 0.58272,
+      "grad_norm": 0.34738804178115684,
+      "learning_rate": 7.825345330441547e-05,
+      "loss": 0.5975,
+      "step": 1821
+    },
+    {
+      "epoch": 0.58304,
+      "grad_norm": 0.3494235111301023,
+      "learning_rate": 7.815229680491672e-05,
+      "loss": 0.6198,
+      "step": 1822
+    },
+    {
+      "epoch": 0.58336,
+      "grad_norm": 0.34502638106532346,
+      "learning_rate": 7.805116377652759e-05,
+      "loss": 0.5754,
+      "step": 1823
+    },
+    {
+      "epoch": 0.58368,
+      "grad_norm": 0.32515833449673626,
+      "learning_rate": 7.795005432789578e-05,
+      "loss": 0.6183,
+      "step": 1824
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3721671465972435,
+      "learning_rate": 7.784896856764378e-05,
+      "loss": 0.6612,
+      "step": 1825
+    },
+    {
+      "epoch": 0.58432,
+      "grad_norm": 0.35012917233931085,
+      "learning_rate": 7.774790660436858e-05,
+      "loss": 0.6351,
+      "step": 1826
+    },
+    {
+      "epoch": 0.58464,
+      "grad_norm": 0.35949715438534474,
+      "learning_rate": 7.76468685466416e-05,
+      "loss": 0.5921,
+      "step": 1827
+    },
+    {
+      "epoch": 0.58496,
+      "grad_norm": 0.3522800868429467,
+      "learning_rate": 7.754585450300857e-05,
+      "loss": 0.5872,
+      "step": 1828
+    },
+    {
+      "epoch": 0.58528,
+      "grad_norm": 0.3535606901082648,
+      "learning_rate": 7.744486458198952e-05,
+      "loss": 0.6495,
+      "step": 1829
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.35126459942610294,
+      "learning_rate": 7.73438988920784e-05,
+      "loss": 0.5455,
+      "step": 1830
+    },
+    {
+      "epoch": 0.58592,
+      "grad_norm": 0.3295892469369936,
+      "learning_rate": 7.724295754174329e-05,
+      "loss": 0.5707,
+      "step": 1831
+    },
+    {
+      "epoch": 0.58624,
+      "grad_norm": 0.355488538883084,
+      "learning_rate": 7.714204063942596e-05,
+      "loss": 0.6039,
+      "step": 1832
+    },
+    {
+      "epoch": 0.58656,
+      "grad_norm": 0.35156084918248576,
+      "learning_rate": 7.704114829354205e-05,
+      "loss": 0.6032,
+      "step": 1833
+    },
+    {
+      "epoch": 0.58688,
+      "grad_norm": 0.332690882577147,
+      "learning_rate": 7.69402806124808e-05,
+      "loss": 0.6067,
+      "step": 1834
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.3662284033432509,
+      "learning_rate": 7.683943770460486e-05,
+      "loss": 0.5819,
+      "step": 1835
+    },
+    {
+      "epoch": 0.58752,
+      "grad_norm": 0.3729064154870074,
+      "learning_rate": 7.67386196782504e-05,
+      "loss": 0.6241,
+      "step": 1836
+    },
+    {
+      "epoch": 0.58784,
+      "grad_norm": 0.35438731610718377,
+      "learning_rate": 7.66378266417267e-05,
+      "loss": 0.6178,
+      "step": 1837
+    },
+    {
+      "epoch": 0.58816,
+      "grad_norm": 0.3445148338757892,
+      "learning_rate": 7.653705870331637e-05,
+      "loss": 0.6472,
+      "step": 1838
+    },
+    {
+      "epoch": 0.58848,
+      "grad_norm": 0.338572057256935,
+      "learning_rate": 7.643631597127491e-05,
+      "loss": 0.5935,
+      "step": 1839
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3743576130506236,
+      "learning_rate": 7.633559855383083e-05,
+      "loss": 0.6354,
+      "step": 1840
+    },
+    {
+      "epoch": 0.58912,
+      "grad_norm": 0.3518943998436618,
+      "learning_rate": 7.623490655918542e-05,
+      "loss": 0.5951,
+      "step": 1841
+    },
+    {
+      "epoch": 0.58944,
+      "grad_norm": 0.3478277490872392,
+      "learning_rate": 7.613424009551262e-05,
+      "loss": 0.6344,
+      "step": 1842
+    },
+    {
+      "epoch": 0.58976,
+      "grad_norm": 0.3804717195769369,
+      "learning_rate": 7.603359927095898e-05,
+      "loss": 0.6052,
+      "step": 1843
+    },
+    {
+      "epoch": 0.59008,
+      "grad_norm": 0.36456673440541854,
+      "learning_rate": 7.593298419364354e-05,
+      "loss": 0.5552,
+      "step": 1844
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3436990292346027,
+      "learning_rate": 7.583239497165758e-05,
+      "loss": 0.5816,
+      "step": 1845
+    },
+    {
+      "epoch": 0.59072,
+      "grad_norm": 0.40387712776280615,
+      "learning_rate": 7.57318317130647e-05,
+      "loss": 0.6242,
+      "step": 1846
+    },
+    {
+      "epoch": 0.59104,
+      "grad_norm": 0.5860322861133728,
+      "learning_rate": 7.563129452590058e-05,
+      "loss": 0.6576,
+      "step": 1847
+    },
+    {
+      "epoch": 0.59136,
+      "grad_norm": 0.33843346953001474,
+      "learning_rate": 7.553078351817284e-05,
+      "loss": 0.604,
+      "step": 1848
+    },
+    {
+      "epoch": 0.59168,
+      "grad_norm": 0.3459407252738132,
+      "learning_rate": 7.54302987978611e-05,
+      "loss": 0.6372,
+      "step": 1849
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3160897855957368,
+      "learning_rate": 7.532984047291653e-05,
+      "loss": 0.5559,
+      "step": 1850
+    },
+    {
+      "epoch": 0.59232,
+      "grad_norm": 0.39557856056846075,
+      "learning_rate": 7.522940865126218e-05,
+      "loss": 0.6034,
+      "step": 1851
+    },
+    {
+      "epoch": 0.59264,
+      "grad_norm": 0.3740328579661679,
+      "learning_rate": 7.512900344079248e-05,
+      "loss": 0.649,
+      "step": 1852
+    },
+    {
+      "epoch": 0.59296,
+      "grad_norm": 0.38069317942855857,
+      "learning_rate": 7.502862494937328e-05,
+      "loss": 0.6618,
+      "step": 1853
+    },
+    {
+      "epoch": 0.59328,
+      "grad_norm": 0.36904855598923475,
+      "learning_rate": 7.49282732848418e-05,
+      "loss": 0.6338,
+      "step": 1854
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.32679068632894726,
+      "learning_rate": 7.482794855500637e-05,
+      "loss": 0.6064,
+      "step": 1855
+    },
+    {
+      "epoch": 0.59392,
+      "grad_norm": 0.33673646719340805,
+      "learning_rate": 7.472765086764636e-05,
+      "loss": 0.5717,
+      "step": 1856
+    },
+    {
+      "epoch": 0.59424,
+      "grad_norm": 0.36930881435386403,
+      "learning_rate": 7.462738033051226e-05,
+      "loss": 0.5982,
+      "step": 1857
+    },
+    {
+      "epoch": 0.59456,
+      "grad_norm": 0.34455375193479976,
+      "learning_rate": 7.452713705132515e-05,
+      "loss": 0.5935,
+      "step": 1858
+    },
+    {
+      "epoch": 0.59488,
+      "grad_norm": 0.3366345570825285,
+      "learning_rate": 7.442692113777698e-05,
+      "loss": 0.5467,
+      "step": 1859
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.34071383754876755,
+      "learning_rate": 7.432673269753033e-05,
+      "loss": 0.6109,
+      "step": 1860
+    },
+    {
+      "epoch": 0.59552,
+      "grad_norm": 0.3355050561880696,
+      "learning_rate": 7.422657183821807e-05,
+      "loss": 0.6128,
+      "step": 1861
+    },
+    {
+      "epoch": 0.59584,
+      "grad_norm": 0.3441311191900993,
+      "learning_rate": 7.41264386674437e-05,
+      "loss": 0.5988,
+      "step": 1862
+    },
+    {
+      "epoch": 0.59616,
+      "grad_norm": 0.35050460869341926,
+      "learning_rate": 7.402633329278077e-05,
+      "loss": 0.562,
+      "step": 1863
+    },
+    {
+      "epoch": 0.59648,
+      "grad_norm": 0.35492535444768214,
+      "learning_rate": 7.392625582177305e-05,
+      "loss": 0.6161,
+      "step": 1864
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.39570849017754073,
+      "learning_rate": 7.382620636193438e-05,
+      "loss": 0.6329,
+      "step": 1865
+    },
+    {
+      "epoch": 0.59712,
+      "grad_norm": 0.3567276679686721,
+      "learning_rate": 7.372618502074839e-05,
+      "loss": 0.6025,
+      "step": 1866
+    },
+    {
+      "epoch": 0.59744,
+      "grad_norm": 0.35229351240295026,
+      "learning_rate": 7.362619190566859e-05,
+      "loss": 0.6299,
+      "step": 1867
+    },
+    {
+      "epoch": 0.59776,
+      "grad_norm": 0.3968985455395164,
+      "learning_rate": 7.352622712411815e-05,
+      "loss": 0.5881,
+      "step": 1868
+    },
+    {
+      "epoch": 0.59808,
+      "grad_norm": 0.3728693452838266,
+      "learning_rate": 7.342629078348975e-05,
+      "loss": 0.6099,
+      "step": 1869
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3265477445107486,
+      "learning_rate": 7.332638299114564e-05,
+      "loss": 0.5906,
+      "step": 1870
+    },
+    {
+      "epoch": 0.59872,
+      "grad_norm": 0.3980449396194788,
+      "learning_rate": 7.322650385441723e-05,
+      "loss": 0.5897,
+      "step": 1871
+    },
+    {
+      "epoch": 0.59904,
+      "grad_norm": 0.441773930417171,
+      "learning_rate": 7.312665348060533e-05,
+      "loss": 0.6003,
+      "step": 1872
+    },
+    {
+      "epoch": 0.59936,
+      "grad_norm": 0.3623260207228797,
+      "learning_rate": 7.302683197697965e-05,
+      "loss": 0.6494,
+      "step": 1873
+    },
+    {
+      "epoch": 0.59968,
+      "grad_norm": 0.3448128798098835,
+      "learning_rate": 7.292703945077903e-05,
+      "loss": 0.551,
+      "step": 1874
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3519105284427964,
+      "learning_rate": 7.28272760092112e-05,
+      "loss": 0.582,
+      "step": 1875
+    },
+    {
+      "epoch": 0.60032,
+      "grad_norm": 0.33984203968889287,
+      "learning_rate": 7.27275417594525e-05,
+      "loss": 0.597,
+      "step": 1876
+    },
+    {
+      "epoch": 0.60064,
+      "grad_norm": 0.3308544795799944,
+      "learning_rate": 7.2627836808648e-05,
+      "loss": 0.5887,
+      "step": 1877
+    },
+    {
+      "epoch": 0.60096,
+      "grad_norm": 0.33063541285678194,
+      "learning_rate": 7.252816126391137e-05,
+      "loss": 0.5823,
+      "step": 1878
+    },
+    {
+      "epoch": 0.60128,
+      "grad_norm": 0.36841842374340583,
+      "learning_rate": 7.242851523232448e-05,
+      "loss": 0.6681,
+      "step": 1879
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3333904385291622,
+      "learning_rate": 7.232889882093774e-05,
+      "loss": 0.5593,
+      "step": 1880
+    },
+    {
+      "epoch": 0.60192,
+      "grad_norm": 0.3394299704793546,
+      "learning_rate": 7.222931213676953e-05,
+      "loss": 0.6011,
+      "step": 1881
+    },
+    {
+      "epoch": 0.60224,
+      "grad_norm": 0.3421787595036792,
+      "learning_rate": 7.212975528680639e-05,
+      "loss": 0.5722,
+      "step": 1882
+    },
+    {
+      "epoch": 0.60256,
+      "grad_norm": 0.3474842414269521,
+      "learning_rate": 7.203022837800286e-05,
+      "loss": 0.6073,
+      "step": 1883
+    },
+    {
+      "epoch": 0.60288,
+      "grad_norm": 0.3393776810551021,
+      "learning_rate": 7.193073151728117e-05,
+      "loss": 0.6158,
+      "step": 1884
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3512342639452111,
+      "learning_rate": 7.183126481153144e-05,
+      "loss": 0.6121,
+      "step": 1885
+    },
+    {
+      "epoch": 0.60352,
+      "grad_norm": 0.3270891255036287,
+      "learning_rate": 7.173182836761121e-05,
+      "loss": 0.5311,
+      "step": 1886
+    },
+    {
+      "epoch": 0.60384,
+      "grad_norm": 0.3703712729881265,
+      "learning_rate": 7.163242229234569e-05,
+      "loss": 0.5523,
+      "step": 1887
+    },
+    {
+      "epoch": 0.60416,
+      "grad_norm": 0.3558917837259812,
+      "learning_rate": 7.153304669252736e-05,
+      "loss": 0.6299,
+      "step": 1888
+    },
+    {
+      "epoch": 0.60448,
+      "grad_norm": 0.43647224378839683,
+      "learning_rate": 7.143370167491596e-05,
+      "loss": 0.6375,
+      "step": 1889
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3509631190257302,
+      "learning_rate": 7.13343873462384e-05,
+      "loss": 0.6277,
+      "step": 1890
+    },
+    {
+      "epoch": 0.60512,
+      "grad_norm": 0.3818035649517864,
+      "learning_rate": 7.123510381318867e-05,
+      "loss": 0.594,
+      "step": 1891
+    },
+    {
+      "epoch": 0.60544,
+      "grad_norm": 0.33709759866295746,
+      "learning_rate": 7.113585118242754e-05,
+      "loss": 0.6027,
+      "step": 1892
+    },
+    {
+      "epoch": 0.60576,
+      "grad_norm": 0.33019830804874145,
+      "learning_rate": 7.103662956058277e-05,
+      "loss": 0.5849,
+      "step": 1893
+    },
+    {
+      "epoch": 0.60608,
+      "grad_norm": 0.33536534872635776,
+      "learning_rate": 7.09374390542486e-05,
+      "loss": 0.6014,
+      "step": 1894
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.34081409825453657,
+      "learning_rate": 7.083827976998599e-05,
+      "loss": 0.6308,
+      "step": 1895
+    },
+    {
+      "epoch": 0.60672,
+      "grad_norm": 0.3457388818466461,
+      "learning_rate": 7.073915181432233e-05,
+      "loss": 0.5743,
+      "step": 1896
+    },
+    {
+      "epoch": 0.60704,
+      "grad_norm": 0.36262196211565395,
+      "learning_rate": 7.064005529375128e-05,
+      "loss": 0.5803,
+      "step": 1897
+    },
+    {
+      "epoch": 0.60736,
+      "grad_norm": 0.3409727779590013,
+      "learning_rate": 7.054099031473287e-05,
+      "loss": 0.6189,
+      "step": 1898
+    },
+    {
+      "epoch": 0.60768,
+      "grad_norm": 0.337990287180337,
+      "learning_rate": 7.044195698369307e-05,
+      "loss": 0.5831,
+      "step": 1899
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3707543164317421,
+      "learning_rate": 7.034295540702397e-05,
+      "loss": 0.6591,
+      "step": 1900
+    },
+    {
+      "epoch": 0.60832,
+      "grad_norm": 0.35627947450473185,
+      "learning_rate": 7.024398569108359e-05,
+      "loss": 0.6008,
+      "step": 1901
+    },
+    {
+      "epoch": 0.60864,
+      "grad_norm": 0.3582176582922628,
+      "learning_rate": 7.014504794219554e-05,
+      "loss": 0.5976,
+      "step": 1902
+    },
+    {
+      "epoch": 0.60896,
+      "grad_norm": 0.32730720026573235,
+      "learning_rate": 7.004614226664925e-05,
+      "loss": 0.6055,
+      "step": 1903
+    },
+    {
+      "epoch": 0.60928,
+      "grad_norm": 0.37083839654458967,
+      "learning_rate": 6.994726877069968e-05,
+      "loss": 0.561,
+      "step": 1904
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.35244138574927625,
+      "learning_rate": 6.984842756056708e-05,
+      "loss": 0.6148,
+      "step": 1905
+    },
+    {
+      "epoch": 0.60992,
+      "grad_norm": 0.33856137853151946,
+      "learning_rate": 6.974961874243722e-05,
+      "loss": 0.6232,
+      "step": 1906
+    },
+    {
+      "epoch": 0.61024,
+      "grad_norm": 0.3460405354598853,
+      "learning_rate": 6.965084242246088e-05,
+      "loss": 0.58,
+      "step": 1907
+    },
+    {
+      "epoch": 0.61056,
+      "grad_norm": 0.34610336494131433,
+      "learning_rate": 6.955209870675403e-05,
+      "loss": 0.6447,
+      "step": 1908
+    },
+    {
+      "epoch": 0.61088,
+      "grad_norm": 0.3563410678571309,
+      "learning_rate": 6.945338770139764e-05,
+      "loss": 0.6213,
+      "step": 1909
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4954793242972794,
+      "learning_rate": 6.935470951243745e-05,
+      "loss": 0.6023,
+      "step": 1910
+    },
+    {
+      "epoch": 0.61152,
+      "grad_norm": 0.40267918022635496,
+      "learning_rate": 6.925606424588405e-05,
+      "loss": 0.6531,
+      "step": 1911
+    },
+    {
+      "epoch": 0.61184,
+      "grad_norm": 0.32319198390328585,
+      "learning_rate": 6.915745200771248e-05,
+      "loss": 0.5721,
+      "step": 1912
+    },
+    {
+      "epoch": 0.61216,
+      "grad_norm": 0.3885673830240352,
+      "learning_rate": 6.905887290386253e-05,
+      "loss": 0.6086,
+      "step": 1913
+    },
+    {
+      "epoch": 0.61248,
+      "grad_norm": 0.31374233624224723,
+      "learning_rate": 6.896032704023826e-05,
+      "loss": 0.556,
+      "step": 1914
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.35042864174812055,
+      "learning_rate": 6.8861814522708e-05,
+      "loss": 0.624,
+      "step": 1915
+    },
+    {
+      "epoch": 0.61312,
+      "grad_norm": 0.3238677912260108,
+      "learning_rate": 6.876333545710436e-05,
+      "loss": 0.5661,
+      "step": 1916
+    },
+    {
+      "epoch": 0.61344,
+      "grad_norm": 0.360876791391741,
+      "learning_rate": 6.866488994922388e-05,
+      "loss": 0.6589,
+      "step": 1917
+    },
+    {
+      "epoch": 0.61376,
+      "grad_norm": 0.3364148105094434,
+      "learning_rate": 6.856647810482715e-05,
+      "loss": 0.6098,
+      "step": 1918
+    },
+    {
+      "epoch": 0.61408,
+      "grad_norm": 0.33953150057904763,
+      "learning_rate": 6.846810002963862e-05,
+      "loss": 0.5835,
+      "step": 1919
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.33444006812866056,
+      "learning_rate": 6.83697558293463e-05,
+      "loss": 0.6008,
+      "step": 1920
+    },
+    {
+      "epoch": 0.61472,
+      "grad_norm": 0.3450236893827937,
+      "learning_rate": 6.8271445609602e-05,
+      "loss": 0.5528,
+      "step": 1921
+    },
+    {
+      "epoch": 0.61504,
+      "grad_norm": 0.3541289842123425,
+      "learning_rate": 6.81731694760209e-05,
+      "loss": 0.575,
+      "step": 1922
+    },
+    {
+      "epoch": 0.61536,
+      "grad_norm": 0.33859576529313284,
+      "learning_rate": 6.807492753418161e-05,
+      "loss": 0.617,
+      "step": 1923
+    },
+    {
+      "epoch": 0.61568,
+      "grad_norm": 0.31719951883031655,
+      "learning_rate": 6.7976719889626e-05,
+      "loss": 0.5352,
+      "step": 1924
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3309750549522487,
+      "learning_rate": 6.787854664785906e-05,
+      "loss": 0.5947,
+      "step": 1925
+    },
+    {
+      "epoch": 0.61632,
+      "grad_norm": 0.35567505854354714,
+      "learning_rate": 6.778040791434887e-05,
+      "loss": 0.5997,
+      "step": 1926
+    },
+    {
+      "epoch": 0.61664,
+      "grad_norm": 0.3596150308996054,
+      "learning_rate": 6.768230379452647e-05,
+      "loss": 0.6021,
+      "step": 1927
+    },
+    {
+      "epoch": 0.61696,
+      "grad_norm": 0.378101334854469,
+      "learning_rate": 6.758423439378556e-05,
+      "loss": 0.6678,
+      "step": 1928
+    },
+    {
+      "epoch": 0.61728,
+      "grad_norm": 0.3581054545682819,
+      "learning_rate": 6.748619981748276e-05,
+      "loss": 0.6109,
+      "step": 1929
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3556626346138884,
+      "learning_rate": 6.738820017093706e-05,
+      "loss": 0.5685,
+      "step": 1930
+    },
+    {
+      "epoch": 0.61792,
+      "grad_norm": 0.35201691923831,
+      "learning_rate": 6.729023555943008e-05,
+      "loss": 0.6298,
+      "step": 1931
+    },
+    {
+      "epoch": 0.61824,
+      "grad_norm": 0.35059339899423375,
+      "learning_rate": 6.71923060882058e-05,
+      "loss": 0.5487,
+      "step": 1932
+    },
+    {
+      "epoch": 0.61856,
+      "grad_norm": 0.35013531370272016,
+      "learning_rate": 6.709441186247027e-05,
+      "loss": 0.6421,
+      "step": 1933
+    },
+    {
+      "epoch": 0.61888,
+      "grad_norm": 0.33988654085275827,
+      "learning_rate": 6.699655298739191e-05,
+      "loss": 0.5705,
+      "step": 1934
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.37528410622289504,
+      "learning_rate": 6.689872956810102e-05,
+      "loss": 0.6252,
+      "step": 1935
+    },
+    {
+      "epoch": 0.61952,
+      "grad_norm": 0.3640375305941058,
+      "learning_rate": 6.680094170968984e-05,
+      "loss": 0.5609,
+      "step": 1936
+    },
+    {
+      "epoch": 0.61984,
+      "grad_norm": 0.34663352050728613,
+      "learning_rate": 6.670318951721244e-05,
+      "loss": 0.6163,
+      "step": 1937
+    },
+    {
+      "epoch": 0.62016,
+      "grad_norm": 0.3362295699160863,
+      "learning_rate": 6.660547309568453e-05,
+      "loss": 0.6443,
+      "step": 1938
+    },
+    {
+      "epoch": 0.62048,
+      "grad_norm": 0.47610771558551523,
+      "learning_rate": 6.650779255008335e-05,
+      "loss": 0.6535,
+      "step": 1939
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.33397382452848845,
+      "learning_rate": 6.641014798534777e-05,
+      "loss": 0.633,
+      "step": 1940
+    },
+    {
+      "epoch": 0.62112,
+      "grad_norm": 0.3480349224542998,
+      "learning_rate": 6.631253950637779e-05,
+      "loss": 0.5857,
+      "step": 1941
+    },
+    {
+      "epoch": 0.62144,
+      "grad_norm": 0.330826962424729,
+      "learning_rate": 6.621496721803482e-05,
+      "loss": 0.587,
+      "step": 1942
+    },
+    {
+      "epoch": 0.62176,
+      "grad_norm": 0.3331030000813696,
+      "learning_rate": 6.611743122514125e-05,
+      "loss": 0.576,
+      "step": 1943
+    },
+    {
+      "epoch": 0.62208,
+      "grad_norm": 0.333396476832253,
+      "learning_rate": 6.601993163248056e-05,
+      "loss": 0.6313,
+      "step": 1944
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3498400431441658,
+      "learning_rate": 6.592246854479716e-05,
+      "loss": 0.6292,
+      "step": 1945
+    },
+    {
+      "epoch": 0.62272,
+      "grad_norm": 0.38921573964876904,
+      "learning_rate": 6.582504206679612e-05,
+      "loss": 0.6416,
+      "step": 1946
+    },
+    {
+      "epoch": 0.62304,
+      "grad_norm": 0.34675016742971393,
+      "learning_rate": 6.57276523031433e-05,
+      "loss": 0.6093,
+      "step": 1947
+    },
+    {
+      "epoch": 0.62336,
+      "grad_norm": 0.34304511632347195,
+      "learning_rate": 6.563029935846501e-05,
+      "loss": 0.6031,
+      "step": 1948
+    },
+    {
+      "epoch": 0.62368,
+      "grad_norm": 0.35691656699875085,
+      "learning_rate": 6.553298333734812e-05,
+      "loss": 0.648,
+      "step": 1949
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33030219390031523,
+      "learning_rate": 6.543570434433974e-05,
+      "loss": 0.5982,
+      "step": 1950
+    },
+    {
+      "epoch": 0.62432,
+      "grad_norm": 0.35822092901925984,
+      "learning_rate": 6.533846248394726e-05,
+      "loss": 0.5655,
+      "step": 1951
+    },
+    {
+      "epoch": 0.62464,
+      "grad_norm": 0.33147031143132805,
+      "learning_rate": 6.524125786063812e-05,
+      "loss": 0.6276,
+      "step": 1952
+    },
+    {
+      "epoch": 0.62496,
+      "grad_norm": 0.3505576625422144,
+      "learning_rate": 6.514409057883985e-05,
+      "loss": 0.5837,
+      "step": 1953
+    },
+    {
+      "epoch": 0.62528,
+      "grad_norm": 0.33463326459661547,
+      "learning_rate": 6.504696074293973e-05,
+      "loss": 0.6308,
+      "step": 1954
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3479494892905318,
+      "learning_rate": 6.494986845728495e-05,
+      "loss": 0.6095,
+      "step": 1955
+    },
+    {
+      "epoch": 0.62592,
+      "grad_norm": 0.3350495826604109,
+      "learning_rate": 6.485281382618222e-05,
+      "loss": 0.5712,
+      "step": 1956
+    },
+    {
+      "epoch": 0.62624,
+      "grad_norm": 0.3455116328868799,
+      "learning_rate": 6.475579695389793e-05,
+      "loss": 0.6226,
+      "step": 1957
+    },
+    {
+      "epoch": 0.62656,
+      "grad_norm": 0.34347161806365567,
+      "learning_rate": 6.465881794465786e-05,
+      "loss": 0.6437,
+      "step": 1958
+    },
+    {
+      "epoch": 0.62688,
+      "grad_norm": 0.333188456743107,
+      "learning_rate": 6.456187690264705e-05,
+      "loss": 0.5938,
+      "step": 1959
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.34214147323845223,
+      "learning_rate": 6.446497393200985e-05,
+      "loss": 0.6036,
+      "step": 1960
+    },
+    {
+      "epoch": 0.62752,
+      "grad_norm": 0.37549074299350527,
+      "learning_rate": 6.436810913684963e-05,
+      "loss": 0.6248,
+      "step": 1961
+    },
+    {
+      "epoch": 0.62784,
+      "grad_norm": 0.3768302593197824,
+      "learning_rate": 6.427128262122877e-05,
+      "loss": 0.5912,
+      "step": 1962
+    },
+    {
+      "epoch": 0.62816,
+      "grad_norm": 0.357635947655499,
+      "learning_rate": 6.41744944891686e-05,
+      "loss": 0.5933,
+      "step": 1963
+    },
+    {
+      "epoch": 0.62848,
+      "grad_norm": 0.3902771304039085,
+      "learning_rate": 6.40777448446491e-05,
+      "loss": 0.5732,
+      "step": 1964
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.32872369588529426,
+      "learning_rate": 6.398103379160894e-05,
+      "loss": 0.604,
+      "step": 1965
+    },
+    {
+      "epoch": 0.62912,
+      "grad_norm": 0.344510616958961,
+      "learning_rate": 6.38843614339454e-05,
+      "loss": 0.6127,
+      "step": 1966
+    },
+    {
+      "epoch": 0.62944,
+      "grad_norm": 0.33791665167147994,
+      "learning_rate": 6.378772787551406e-05,
+      "loss": 0.6079,
+      "step": 1967
+    },
+    {
+      "epoch": 0.62976,
+      "grad_norm": 0.3644273641741268,
+      "learning_rate": 6.369113322012898e-05,
+      "loss": 0.6121,
+      "step": 1968
+    },
+    {
+      "epoch": 0.63008,
+      "grad_norm": 0.3471073608458236,
+      "learning_rate": 6.359457757156225e-05,
+      "loss": 0.6552,
+      "step": 1969
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.33374082385825,
+      "learning_rate": 6.349806103354417e-05,
+      "loss": 0.5831,
+      "step": 1970
+    },
+    {
+      "epoch": 0.63072,
+      "grad_norm": 0.334394370938585,
+      "learning_rate": 6.340158370976306e-05,
+      "loss": 0.6162,
+      "step": 1971
+    },
+    {
+      "epoch": 0.63104,
+      "grad_norm": 0.3800632214695881,
+      "learning_rate": 6.330514570386495e-05,
+      "loss": 0.5994,
+      "step": 1972
+    },
+    {
+      "epoch": 0.63136,
+      "grad_norm": 0.3599124750253699,
+      "learning_rate": 6.320874711945382e-05,
+      "loss": 0.643,
+      "step": 1973
+    },
+    {
+      "epoch": 0.63168,
+      "grad_norm": 0.3223482280689595,
+      "learning_rate": 6.311238806009112e-05,
+      "loss": 0.5583,
+      "step": 1974
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3569872180637835,
+      "learning_rate": 6.301606862929599e-05,
+      "loss": 0.6399,
+      "step": 1975
+    },
+    {
+      "epoch": 0.63232,
+      "grad_norm": 0.4037901641847413,
+      "learning_rate": 6.291978893054493e-05,
+      "loss": 0.5964,
+      "step": 1976
+    },
+    {
+      "epoch": 0.63264,
+      "grad_norm": 0.3777917756077199,
+      "learning_rate": 6.28235490672717e-05,
+      "loss": 0.6511,
+      "step": 1977
+    },
+    {
+      "epoch": 0.63296,
+      "grad_norm": 0.3351399149280321,
+      "learning_rate": 6.272734914286738e-05,
+      "loss": 0.6158,
+      "step": 1978
+    },
+    {
+      "epoch": 0.63328,
+      "grad_norm": 0.42345018526463113,
+      "learning_rate": 6.26311892606801e-05,
+      "loss": 0.6283,
+      "step": 1979
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.35112881188089906,
+      "learning_rate": 6.253506952401486e-05,
+      "loss": 0.5964,
+      "step": 1980
+    },
+    {
+      "epoch": 0.63392,
+      "grad_norm": 0.3509198575726522,
+      "learning_rate": 6.243899003613378e-05,
+      "loss": 0.6295,
+      "step": 1981
+    },
+    {
+      "epoch": 0.63424,
+      "grad_norm": 0.34702519733189763,
+      "learning_rate": 6.234295090025543e-05,
+      "loss": 0.6483,
+      "step": 1982
+    },
+    {
+      "epoch": 0.63456,
+      "grad_norm": 0.3365075547843539,
+      "learning_rate": 6.224695221955528e-05,
+      "loss": 0.5985,
+      "step": 1983
+    },
+    {
+      "epoch": 0.63488,
+      "grad_norm": 0.3427907029581174,
+      "learning_rate": 6.215099409716527e-05,
+      "loss": 0.5935,
+      "step": 1984
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3898604857758913,
+      "learning_rate": 6.205507663617369e-05,
+      "loss": 0.594,
+      "step": 1985
+    },
+    {
+      "epoch": 0.63552,
+      "grad_norm": 0.36899032854667785,
+      "learning_rate": 6.195919993962526e-05,
+      "loss": 0.59,
+      "step": 1986
+    },
+    {
+      "epoch": 0.63584,
+      "grad_norm": 0.3647593849049401,
+      "learning_rate": 6.186336411052076e-05,
+      "loss": 0.6184,
+      "step": 1987
+    },
+    {
+      "epoch": 0.63616,
+      "grad_norm": 0.36788914135648504,
+      "learning_rate": 6.176756925181724e-05,
+      "loss": 0.5693,
+      "step": 1988
+    },
+    {
+      "epoch": 0.63648,
+      "grad_norm": 0.35888830905427066,
+      "learning_rate": 6.167181546642765e-05,
+      "loss": 0.5862,
+      "step": 1989
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3916968509376385,
+      "learning_rate": 6.157610285722075e-05,
+      "loss": 0.6235,
+      "step": 1990
+    },
+    {
+      "epoch": 0.63712,
+      "grad_norm": 0.36725234260748274,
+      "learning_rate": 6.148043152702123e-05,
+      "loss": 0.5974,
+      "step": 1991
+    },
+    {
+      "epoch": 0.63744,
+      "grad_norm": 0.3528027520142461,
+      "learning_rate": 6.138480157860921e-05,
+      "loss": 0.5824,
+      "step": 1992
+    },
+    {
+      "epoch": 0.63776,
+      "grad_norm": 0.33292930868441833,
+      "learning_rate": 6.12892131147206e-05,
+      "loss": 0.5512,
+      "step": 1993
+    },
+    {
+      "epoch": 0.63808,
+      "grad_norm": 0.34146409838349157,
+      "learning_rate": 6.119366623804657e-05,
+      "loss": 0.5762,
+      "step": 1994
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.37487891872563855,
+      "learning_rate": 6.109816105123362e-05,
+      "loss": 0.6288,
+      "step": 1995
+    },
+    {
+      "epoch": 0.63872,
+      "grad_norm": 0.34305373560782226,
+      "learning_rate": 6.1002697656883534e-05,
+      "loss": 0.6155,
+      "step": 1996
+    },
+    {
+      "epoch": 0.63904,
+      "grad_norm": 0.35759278621831475,
+      "learning_rate": 6.090727615755323e-05,
+      "loss": 0.6356,
+      "step": 1997
+    },
+    {
+      "epoch": 0.63936,
+      "grad_norm": 0.34842299986583225,
+      "learning_rate": 6.0811896655754465e-05,
+      "loss": 0.5795,
+      "step": 1998
+    },
+    {
+      "epoch": 0.63968,
+      "grad_norm": 0.3542660813580204,
+      "learning_rate": 6.0716559253954066e-05,
+      "loss": 0.5973,
+      "step": 1999
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.37031274294214567,
+      "learning_rate": 6.0621264054573435e-05,
+      "loss": 0.6216,
+      "step": 2000
+    },
+    {
+      "epoch": 0.64032,
+      "grad_norm": 0.354375054376305,
+      "learning_rate": 6.052601115998878e-05,
+      "loss": 0.612,
+      "step": 2001
+    },
+    {
+      "epoch": 0.64064,
+      "grad_norm": 0.34356793991137885,
+      "learning_rate": 6.0430800672530876e-05,
+      "loss": 0.5865,
+      "step": 2002
+    },
+    {
+      "epoch": 0.64096,
+      "grad_norm": 0.34524776507829036,
+      "learning_rate": 6.0335632694484786e-05,
+      "loss": 0.566,
+      "step": 2003
+    },
+    {
+      "epoch": 0.64128,
+      "grad_norm": 0.3490977700367447,
+      "learning_rate": 6.024050732809008e-05,
+      "loss": 0.5903,
+      "step": 2004
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.323204570629607,
+      "learning_rate": 6.0145424675540394e-05,
+      "loss": 0.5869,
+      "step": 2005
+    },
+    {
+      "epoch": 0.64192,
+      "grad_norm": 0.3839359250539005,
+      "learning_rate": 6.005038483898362e-05,
+      "loss": 0.6245,
+      "step": 2006
+    },
+    {
+      "epoch": 0.64224,
+      "grad_norm": 0.36390599085911796,
+      "learning_rate": 5.9955387920521556e-05,
+      "loss": 0.6403,
+      "step": 2007
+    },
+    {
+      "epoch": 0.64256,
+      "grad_norm": 0.3506411929622666,
+      "learning_rate": 5.986043402220991e-05,
+      "loss": 0.6148,
+      "step": 2008
+    },
+    {
+      "epoch": 0.64288,
+      "grad_norm": 0.36104334103629493,
+      "learning_rate": 5.97655232460582e-05,
+      "loss": 0.5755,
+      "step": 2009
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.36005742059183676,
+      "learning_rate": 5.967065569402963e-05,
+      "loss": 0.6218,
+      "step": 2010
+    },
+    {
+      "epoch": 0.64352,
+      "grad_norm": 0.3401721909877025,
+      "learning_rate": 5.957583146804089e-05,
+      "loss": 0.5984,
+      "step": 2011
+    },
+    {
+      "epoch": 0.64384,
+      "grad_norm": 0.3494695725479328,
+      "learning_rate": 5.948105066996221e-05,
+      "loss": 0.6064,
+      "step": 2012
+    },
+    {
+      "epoch": 0.64416,
+      "grad_norm": 0.3554958485339228,
+      "learning_rate": 5.938631340161711e-05,
+      "loss": 0.6191,
+      "step": 2013
+    },
+    {
+      "epoch": 0.64448,
+      "grad_norm": 0.37812082673335734,
+      "learning_rate": 5.929161976478237e-05,
+      "loss": 0.6146,
+      "step": 2014
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3504930724082363,
+      "learning_rate": 5.919696986118792e-05,
+      "loss": 0.6244,
+      "step": 2015
+    },
+    {
+      "epoch": 0.64512,
+      "grad_norm": 0.34501863572185304,
+      "learning_rate": 5.910236379251664e-05,
+      "loss": 0.6019,
+      "step": 2016
+    },
+    {
+      "epoch": 0.64544,
+      "grad_norm": 0.347046718981164,
+      "learning_rate": 5.9007801660404406e-05,
+      "loss": 0.6037,
+      "step": 2017
+    },
+    {
+      "epoch": 0.64576,
+      "grad_norm": 0.334516084814039,
+      "learning_rate": 5.891328356643979e-05,
+      "loss": 0.5849,
+      "step": 2018
+    },
+    {
+      "epoch": 0.64608,
+      "grad_norm": 0.3486600052010709,
+      "learning_rate": 5.881880961216415e-05,
+      "loss": 0.5822,
+      "step": 2019
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.37517557288961817,
+      "learning_rate": 5.872437989907136e-05,
+      "loss": 0.6097,
+      "step": 2020
+    },
+    {
+      "epoch": 0.64672,
+      "grad_norm": 0.35633671254185595,
+      "learning_rate": 5.86299945286078e-05,
+      "loss": 0.6159,
+      "step": 2021
+    },
+    {
+      "epoch": 0.64704,
+      "grad_norm": 0.33265969768336545,
+      "learning_rate": 5.85356536021722e-05,
+      "loss": 0.5683,
+      "step": 2022
+    },
+    {
+      "epoch": 0.64736,
+      "grad_norm": 0.3455329694067352,
+      "learning_rate": 5.844135722111555e-05,
+      "loss": 0.5739,
+      "step": 2023
+    },
+    {
+      "epoch": 0.64768,
+      "grad_norm": 0.35329932653586066,
+      "learning_rate": 5.8347105486740906e-05,
+      "loss": 0.5872,
+      "step": 2024
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.36760192003971637,
+      "learning_rate": 5.8252898500303575e-05,
+      "loss": 0.6332,
+      "step": 2025
+    },
+    {
+      "epoch": 0.64832,
+      "grad_norm": 0.3529618057725791,
+      "learning_rate": 5.8158736363010526e-05,
+      "loss": 0.5883,
+      "step": 2026
+    },
+    {
+      "epoch": 0.64864,
+      "grad_norm": 0.3567232711947976,
+      "learning_rate": 5.806461917602074e-05,
+      "loss": 0.5695,
+      "step": 2027
+    },
+    {
+      "epoch": 0.64896,
+      "grad_norm": 0.32973668181108357,
+      "learning_rate": 5.7970547040444826e-05,
+      "loss": 0.6436,
+      "step": 2028
+    },
+    {
+      "epoch": 0.64928,
+      "grad_norm": 0.36989591043910014,
+      "learning_rate": 5.787652005734494e-05,
+      "loss": 0.6,
+      "step": 2029
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3512108851293808,
+      "learning_rate": 5.7782538327734884e-05,
+      "loss": 0.5723,
+      "step": 2030
+    },
+    {
+      "epoch": 0.64992,
+      "grad_norm": 0.36034410354167296,
+      "learning_rate": 5.768860195257968e-05,
+      "loss": 0.6217,
+      "step": 2031
+    },
+    {
+      "epoch": 0.65024,
+      "grad_norm": 0.3634043706264434,
+      "learning_rate": 5.7594711032795736e-05,
+      "loss": 0.6413,
+      "step": 2032
+    },
+    {
+      "epoch": 0.65056,
+      "grad_norm": 0.34696679449036355,
+      "learning_rate": 5.7500865669250626e-05,
+      "loss": 0.6035,
+      "step": 2033
+    },
+    {
+      "epoch": 0.65088,
+      "grad_norm": 0.36921144484327034,
+      "learning_rate": 5.7407065962762875e-05,
+      "loss": 0.638,
+      "step": 2034
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3604245985171455,
+      "learning_rate": 5.731331201410211e-05,
+      "loss": 0.6358,
+      "step": 2035
+    },
+    {
+      "epoch": 0.65152,
+      "grad_norm": 0.33336876492518613,
+      "learning_rate": 5.721960392398864e-05,
+      "loss": 0.5855,
+      "step": 2036
+    },
+    {
+      "epoch": 0.65184,
+      "grad_norm": 0.35765372253095246,
+      "learning_rate": 5.712594179309363e-05,
+      "loss": 0.5929,
+      "step": 2037
+    },
+    {
+      "epoch": 0.65216,
+      "grad_norm": 0.3225003554617414,
+      "learning_rate": 5.703232572203887e-05,
+      "loss": 0.6205,
+      "step": 2038
+    },
+    {
+      "epoch": 0.65248,
+      "grad_norm": 0.3435914377123781,
+      "learning_rate": 5.693875581139656e-05,
+      "loss": 0.5663,
+      "step": 2039
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3528038112643403,
+      "learning_rate": 5.68452321616894e-05,
+      "loss": 0.695,
+      "step": 2040
+    },
+    {
+      "epoch": 0.65312,
+      "grad_norm": 0.34626246994151005,
+      "learning_rate": 5.675175487339042e-05,
+      "loss": 0.6674,
+      "step": 2041
+    },
+    {
+      "epoch": 0.65344,
+      "grad_norm": 0.35168272875291506,
+      "learning_rate": 5.66583240469227e-05,
+      "loss": 0.6012,
+      "step": 2042
+    },
+    {
+      "epoch": 0.65376,
+      "grad_norm": 0.3549995660404429,
+      "learning_rate": 5.65649397826596e-05,
+      "loss": 0.6469,
+      "step": 2043
+    },
+    {
+      "epoch": 0.65408,
+      "grad_norm": 0.34065114536047747,
+      "learning_rate": 5.647160218092424e-05,
+      "loss": 0.5899,
+      "step": 2044
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.343485285841103,
+      "learning_rate": 5.637831134198982e-05,
+      "loss": 0.6092,
+      "step": 2045
+    },
+    {
+      "epoch": 0.65472,
+      "grad_norm": 0.3442168880757106,
+      "learning_rate": 5.6285067366079214e-05,
+      "loss": 0.6046,
+      "step": 2046
+    },
+    {
+      "epoch": 0.65504,
+      "grad_norm": 0.314384420811312,
+      "learning_rate": 5.6191870353364864e-05,
+      "loss": 0.5792,
+      "step": 2047
+    },
+    {
+      "epoch": 0.65536,
+      "grad_norm": 0.3546328917788084,
+      "learning_rate": 5.609872040396894e-05,
+      "loss": 0.5736,
+      "step": 2048
+    },
+    {
+      "epoch": 0.65568,
+      "grad_norm": 0.3309075071145526,
+      "learning_rate": 5.600561761796286e-05,
+      "loss": 0.5955,
+      "step": 2049
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3464703362931646,
+      "learning_rate": 5.5912562095367514e-05,
+      "loss": 0.6404,
+      "step": 2050
+    },
+    {
+      "epoch": 0.65632,
+      "grad_norm": 0.37007410033329685,
+      "learning_rate": 5.5819553936153e-05,
+      "loss": 0.5883,
+      "step": 2051
+    },
+    {
+      "epoch": 0.65664,
+      "grad_norm": 0.35701392801641474,
+      "learning_rate": 5.5726593240238436e-05,
+      "loss": 0.5559,
+      "step": 2052
+    },
+    {
+      "epoch": 0.65696,
+      "grad_norm": 0.34402146381815724,
+      "learning_rate": 5.563368010749208e-05,
+      "loss": 0.5914,
+      "step": 2053
+    },
+    {
+      "epoch": 0.65728,
+      "grad_norm": 0.332056145194669,
+      "learning_rate": 5.554081463773098e-05,
+      "loss": 0.5983,
+      "step": 2054
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.34100466230285387,
+      "learning_rate": 5.544799693072107e-05,
+      "loss": 0.5451,
+      "step": 2055
+    },
+    {
+      "epoch": 0.65792,
+      "grad_norm": 0.3560960301887002,
+      "learning_rate": 5.535522708617686e-05,
+      "loss": 0.6306,
+      "step": 2056
+    },
+    {
+      "epoch": 0.65824,
+      "grad_norm": 0.3403216703658889,
+      "learning_rate": 5.5262505203761624e-05,
+      "loss": 0.6284,
+      "step": 2057
+    },
+    {
+      "epoch": 0.65856,
+      "grad_norm": 0.3366642146627937,
+      "learning_rate": 5.516983138308689e-05,
+      "loss": 0.5765,
+      "step": 2058
+    },
+    {
+      "epoch": 0.65888,
+      "grad_norm": 0.3502813843702922,
+      "learning_rate": 5.5077205723712745e-05,
+      "loss": 0.614,
+      "step": 2059
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.36374933081863037,
+      "learning_rate": 5.498462832514737e-05,
+      "loss": 0.6305,
+      "step": 2060
+    },
+    {
+      "epoch": 0.65952,
+      "grad_norm": 0.36312266218067474,
+      "learning_rate": 5.4892099286847274e-05,
+      "loss": 0.5706,
+      "step": 2061
+    },
+    {
+      "epoch": 0.65984,
+      "grad_norm": 0.3431109901545333,
+      "learning_rate": 5.479961870821683e-05,
+      "loss": 0.5773,
+      "step": 2062
+    },
+    {
+      "epoch": 0.66016,
+      "grad_norm": 0.346553441248706,
+      "learning_rate": 5.470718668860848e-05,
+      "loss": 0.5697,
+      "step": 2063
+    },
+    {
+      "epoch": 0.66048,
+      "grad_norm": 0.3350030065506168,
+      "learning_rate": 5.461480332732249e-05,
+      "loss": 0.6016,
+      "step": 2064
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.34105759765938687,
+      "learning_rate": 5.4522468723606766e-05,
+      "loss": 0.5919,
+      "step": 2065
+    },
+    {
+      "epoch": 0.66112,
+      "grad_norm": 0.34790499613377257,
+      "learning_rate": 5.4430182976656944e-05,
+      "loss": 0.6302,
+      "step": 2066
+    },
+    {
+      "epoch": 0.66144,
+      "grad_norm": 0.3197844662148609,
+      "learning_rate": 5.433794618561605e-05,
+      "loss": 0.5738,
+      "step": 2067
+    },
+    {
+      "epoch": 0.66176,
+      "grad_norm": 0.3824322801478392,
+      "learning_rate": 5.424575844957462e-05,
+      "loss": 0.5626,
+      "step": 2068
+    },
+    {
+      "epoch": 0.66208,
+      "grad_norm": 0.35418669909269734,
+      "learning_rate": 5.41536198675705e-05,
+      "loss": 0.5816,
+      "step": 2069
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.35688396732221106,
+      "learning_rate": 5.40615305385886e-05,
+      "loss": 0.6281,
+      "step": 2070
+    },
+    {
+      "epoch": 0.66272,
+      "grad_norm": 0.3121624609259152,
+      "learning_rate": 5.396949056156104e-05,
+      "loss": 0.5537,
+      "step": 2071
+    },
+    {
+      "epoch": 0.66304,
+      "grad_norm": 0.3397661727705769,
+      "learning_rate": 5.387750003536691e-05,
+      "loss": 0.5835,
+      "step": 2072
+    },
+    {
+      "epoch": 0.66336,
+      "grad_norm": 0.3582237880936574,
+      "learning_rate": 5.378555905883209e-05,
+      "loss": 0.6073,
+      "step": 2073
+    },
+    {
+      "epoch": 0.66368,
+      "grad_norm": 0.35398878554064056,
+      "learning_rate": 5.369366773072935e-05,
+      "loss": 0.5629,
+      "step": 2074
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.41663713321345,
+      "learning_rate": 5.3601826149777966e-05,
+      "loss": 0.6176,
+      "step": 2075
+    },
+    {
+      "epoch": 0.66432,
+      "grad_norm": 0.45132563293139605,
+      "learning_rate": 5.3510034414643926e-05,
+      "loss": 0.6003,
+      "step": 2076
+    },
+    {
+      "epoch": 0.66464,
+      "grad_norm": 0.3487035577840251,
+      "learning_rate": 5.341829262393962e-05,
+      "loss": 0.614,
+      "step": 2077
+    },
+    {
+      "epoch": 0.66496,
+      "grad_norm": 0.346383983874882,
+      "learning_rate": 5.33266008762237e-05,
+      "loss": 0.5896,
+      "step": 2078
+    },
+    {
+      "epoch": 0.66528,
+      "grad_norm": 0.3498860534467345,
+      "learning_rate": 5.323495927000121e-05,
+      "loss": 0.5743,
+      "step": 2079
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.32260245557893713,
+      "learning_rate": 5.314336790372314e-05,
+      "loss": 0.6027,
+      "step": 2080
+    },
+    {
+      "epoch": 0.66592,
+      "grad_norm": 0.3525826953611603,
+      "learning_rate": 5.305182687578669e-05,
+      "loss": 0.6021,
+      "step": 2081
+    },
+    {
+      "epoch": 0.66624,
+      "grad_norm": 0.34509038475476195,
+      "learning_rate": 5.296033628453484e-05,
+      "loss": 0.5911,
+      "step": 2082
+    },
+    {
+      "epoch": 0.66656,
+      "grad_norm": 0.3451670125169388,
+      "learning_rate": 5.28688962282565e-05,
+      "loss": 0.6002,
+      "step": 2083
+    },
+    {
+      "epoch": 0.66688,
+      "grad_norm": 0.3438255109041832,
+      "learning_rate": 5.277750680518616e-05,
+      "loss": 0.6187,
+      "step": 2084
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4060436564166929,
+      "learning_rate": 5.2686168113504065e-05,
+      "loss": 0.6208,
+      "step": 2085
+    },
+    {
+      "epoch": 0.66752,
+      "grad_norm": 0.3526091690642306,
+      "learning_rate": 5.259488025133581e-05,
+      "loss": 0.6036,
+      "step": 2086
+    },
+    {
+      "epoch": 0.66784,
+      "grad_norm": 0.3399029864410244,
+      "learning_rate": 5.2503643316752525e-05,
+      "loss": 0.5794,
+      "step": 2087
+    },
+    {
+      "epoch": 0.66816,
+      "grad_norm": 0.3793844190152782,
+      "learning_rate": 5.241245740777048e-05,
+      "loss": 0.5522,
+      "step": 2088
+    },
+    {
+      "epoch": 0.66848,
+      "grad_norm": 0.335563960813538,
+      "learning_rate": 5.2321322622351254e-05,
+      "loss": 0.5831,
+      "step": 2089
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3644425321825224,
+      "learning_rate": 5.22302390584015e-05,
+      "loss": 0.6042,
+      "step": 2090
+    },
+    {
+      "epoch": 0.66912,
+      "grad_norm": 0.3451372037548831,
+      "learning_rate": 5.213920681377269e-05,
+      "loss": 0.6461,
+      "step": 2091
+    },
+    {
+      "epoch": 0.66944,
+      "grad_norm": 0.35458415976060564,
+      "learning_rate": 5.20482259862614e-05,
+      "loss": 0.5987,
+      "step": 2092
+    },
+    {
+      "epoch": 0.66976,
+      "grad_norm": 0.3343760722253775,
+      "learning_rate": 5.195729667360871e-05,
+      "loss": 0.5951,
+      "step": 2093
+    },
+    {
+      "epoch": 0.67008,
+      "grad_norm": 0.32043943548255815,
+      "learning_rate": 5.1866418973500575e-05,
+      "loss": 0.5356,
+      "step": 2094
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.35326541827919694,
+      "learning_rate": 5.1775592983567426e-05,
+      "loss": 0.6788,
+      "step": 2095
+    },
+    {
+      "epoch": 0.67072,
+      "grad_norm": 0.34535253136486044,
+      "learning_rate": 5.168481880138405e-05,
+      "loss": 0.5929,
+      "step": 2096
+    },
+    {
+      "epoch": 0.67104,
+      "grad_norm": 0.3491851474045318,
+      "learning_rate": 5.159409652446976e-05,
+      "loss": 0.6167,
+      "step": 2097
+    },
+    {
+      "epoch": 0.67136,
+      "grad_norm": 0.35441336377883786,
+      "learning_rate": 5.15034262502879e-05,
+      "loss": 0.5981,
+      "step": 2098
+    },
+    {
+      "epoch": 0.67168,
+      "grad_norm": 0.338926333841282,
+      "learning_rate": 5.1412808076246123e-05,
+      "loss": 0.5557,
+      "step": 2099
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3196784137658592,
+      "learning_rate": 5.132224209969605e-05,
+      "loss": 0.5487,
+      "step": 2100
+    },
+    {
+      "epoch": 0.67232,
+      "grad_norm": 0.34804145701447764,
+      "learning_rate": 5.123172841793315e-05,
+      "loss": 0.5714,
+      "step": 2101
+    },
+    {
+      "epoch": 0.67264,
+      "grad_norm": 0.37430255623358516,
+      "learning_rate": 5.1141267128196804e-05,
+      "loss": 0.6019,
+      "step": 2102
+    },
+    {
+      "epoch": 0.67296,
+      "grad_norm": 0.33396059911421894,
+      "learning_rate": 5.1050858327670136e-05,
+      "loss": 0.5526,
+      "step": 2103
+    },
+    {
+      "epoch": 0.67328,
+      "grad_norm": 0.34771881718642605,
+      "learning_rate": 5.096050211347975e-05,
+      "loss": 0.577,
+      "step": 2104
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.3362442533689697,
+      "learning_rate": 5.087019858269588e-05,
+      "loss": 0.617,
+      "step": 2105
+    },
+    {
+      "epoch": 0.67392,
+      "grad_norm": 0.3726473249121704,
+      "learning_rate": 5.0779947832332074e-05,
+      "loss": 0.6207,
+      "step": 2106
+    },
+    {
+      "epoch": 0.67424,
+      "grad_norm": 0.33868489525988194,
+      "learning_rate": 5.068974995934523e-05,
+      "loss": 0.5762,
+      "step": 2107
+    },
+    {
+      "epoch": 0.67456,
+      "grad_norm": 0.34236111482465825,
+      "learning_rate": 5.059960506063548e-05,
+      "loss": 0.6233,
+      "step": 2108
+    },
+    {
+      "epoch": 0.67488,
+      "grad_norm": 0.36793989757759804,
+      "learning_rate": 5.05095132330459e-05,
+      "loss": 0.6449,
+      "step": 2109
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3468855130341154,
+      "learning_rate": 5.041947457336274e-05,
+      "loss": 0.5863,
+      "step": 2110
+    },
+    {
+      "epoch": 0.67552,
+      "grad_norm": 0.3463551120740465,
+      "learning_rate": 5.0329489178314974e-05,
+      "loss": 0.5534,
+      "step": 2111
+    },
+    {
+      "epoch": 0.67584,
+      "grad_norm": 0.41554523742853994,
+      "learning_rate": 5.023955714457439e-05,
+      "loss": 0.64,
+      "step": 2112
+    },
+    {
+      "epoch": 0.67616,
+      "grad_norm": 0.3500498558145719,
+      "learning_rate": 5.0149678568755545e-05,
+      "loss": 0.598,
+      "step": 2113
+    },
+    {
+      "epoch": 0.67648,
+      "grad_norm": 0.3515305974392414,
+      "learning_rate": 5.005985354741543e-05,
+      "loss": 0.5571,
+      "step": 2114
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.34353499770713497,
+      "learning_rate": 4.99700821770536e-05,
+      "loss": 0.568,
+      "step": 2115
+    },
+    {
+      "epoch": 0.67712,
+      "grad_norm": 0.3287209481117013,
+      "learning_rate": 4.988036455411197e-05,
+      "loss": 0.5798,
+      "step": 2116
+    },
+    {
+      "epoch": 0.67744,
+      "grad_norm": 0.32849266582937475,
+      "learning_rate": 4.9790700774974605e-05,
+      "loss": 0.5828,
+      "step": 2117
+    },
+    {
+      "epoch": 0.67776,
+      "grad_norm": 0.3595703565476983,
+      "learning_rate": 4.97010909359679e-05,
+      "loss": 0.6021,
+      "step": 2118
+    },
+    {
+      "epoch": 0.67808,
+      "grad_norm": 0.6601814705838766,
+      "learning_rate": 4.961153513336011e-05,
+      "loss": 0.6094,
+      "step": 2119
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3317350540717859,
+      "learning_rate": 4.952203346336158e-05,
+      "loss": 0.542,
+      "step": 2120
+    },
+    {
+      "epoch": 0.67872,
+      "grad_norm": 0.3544282957512177,
+      "learning_rate": 4.9432586022124494e-05,
+      "loss": 0.6089,
+      "step": 2121
+    },
+    {
+      "epoch": 0.67904,
+      "grad_norm": 0.3654747649938245,
+      "learning_rate": 4.934319290574266e-05,
+      "loss": 0.5781,
+      "step": 2122
+    },
+    {
+      "epoch": 0.67936,
+      "grad_norm": 0.3118886344431738,
+      "learning_rate": 4.925385421025167e-05,
+      "loss": 0.5373,
+      "step": 2123
+    },
+    {
+      "epoch": 0.67968,
+      "grad_norm": 0.3451120407212237,
+      "learning_rate": 4.916457003162852e-05,
+      "loss": 0.6043,
+      "step": 2124
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.35425978467866664,
+      "learning_rate": 4.907534046579173e-05,
+      "loss": 0.6149,
+      "step": 2125
+    },
+    {
+      "epoch": 0.68032,
+      "grad_norm": 0.3482058751281036,
+      "learning_rate": 4.898616560860116e-05,
+      "loss": 0.5905,
+      "step": 2126
+    },
+    {
+      "epoch": 0.68064,
+      "grad_norm": 0.3737144520124246,
+      "learning_rate": 4.88970455558578e-05,
+      "loss": 0.6353,
+      "step": 2127
+    },
+    {
+      "epoch": 0.68096,
+      "grad_norm": 0.4134031552056952,
+      "learning_rate": 4.880798040330383e-05,
+      "loss": 0.5448,
+      "step": 2128
+    },
+    {
+      "epoch": 0.68128,
+      "grad_norm": 0.378894264940317,
+      "learning_rate": 4.8718970246622496e-05,
+      "loss": 0.5831,
+      "step": 2129
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3403382839635182,
+      "learning_rate": 4.8630015181437826e-05,
+      "loss": 0.592,
+      "step": 2130
+    },
+    {
+      "epoch": 0.68192,
+      "grad_norm": 0.32596992803050007,
+      "learning_rate": 4.8541115303314824e-05,
+      "loss": 0.6038,
+      "step": 2131
+    },
+    {
+      "epoch": 0.68224,
+      "grad_norm": 0.3523430624349347,
+      "learning_rate": 4.845227070775903e-05,
+      "loss": 0.5373,
+      "step": 2132
+    },
+    {
+      "epoch": 0.68256,
+      "grad_norm": 0.36324300667545956,
+      "learning_rate": 4.8363481490216754e-05,
+      "loss": 0.6118,
+      "step": 2133
+    },
+    {
+      "epoch": 0.68288,
+      "grad_norm": 0.35990878032248785,
+      "learning_rate": 4.827474774607478e-05,
+      "loss": 0.5571,
+      "step": 2134
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.35246368836583347,
+      "learning_rate": 4.8186069570660175e-05,
+      "loss": 0.6019,
+      "step": 2135
+    },
+    {
+      "epoch": 0.68352,
+      "grad_norm": 0.33820762145884403,
+      "learning_rate": 4.809744705924049e-05,
+      "loss": 0.5542,
+      "step": 2136
+    },
+    {
+      "epoch": 0.68384,
+      "grad_norm": 0.36396619324937507,
+      "learning_rate": 4.8008880307023286e-05,
+      "loss": 0.6071,
+      "step": 2137
+    },
+    {
+      "epoch": 0.68416,
+      "grad_norm": 0.34297769819882357,
+      "learning_rate": 4.792036940915642e-05,
+      "loss": 0.6004,
+      "step": 2138
+    },
+    {
+      "epoch": 0.68448,
+      "grad_norm": 0.3446707277874218,
+      "learning_rate": 4.783191446072757e-05,
+      "loss": 0.5795,
+      "step": 2139
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.34247701064963976,
+      "learning_rate": 4.774351555676443e-05,
+      "loss": 0.5997,
+      "step": 2140
+    },
+    {
+      "epoch": 0.68512,
+      "grad_norm": 0.34031480373250517,
+      "learning_rate": 4.7655172792234416e-05,
+      "loss": 0.5971,
+      "step": 2141
+    },
+    {
+      "epoch": 0.68544,
+      "grad_norm": 0.3418328397157584,
+      "learning_rate": 4.756688626204462e-05,
+      "loss": 0.5388,
+      "step": 2142
+    },
+    {
+      "epoch": 0.68576,
+      "grad_norm": 0.33830150878911225,
+      "learning_rate": 4.7478656061041785e-05,
+      "loss": 0.6027,
+      "step": 2143
+    },
+    {
+      "epoch": 0.68608,
+      "grad_norm": 0.3290347537066264,
+      "learning_rate": 4.7390482284012137e-05,
+      "loss": 0.5667,
+      "step": 2144
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.322758433693767,
+      "learning_rate": 4.7302365025681206e-05,
+      "loss": 0.5579,
+      "step": 2145
+    },
+    {
+      "epoch": 0.68672,
+      "grad_norm": 0.32916432132305706,
+      "learning_rate": 4.7214304380713883e-05,
+      "loss": 0.5601,
+      "step": 2146
+    },
+    {
+      "epoch": 0.68704,
+      "grad_norm": 0.35069694685103214,
+      "learning_rate": 4.7126300443714235e-05,
+      "loss": 0.6728,
+      "step": 2147
+    },
+    {
+      "epoch": 0.68736,
+      "grad_norm": 0.3372776585334965,
+      "learning_rate": 4.703835330922531e-05,
+      "loss": 0.5553,
+      "step": 2148
+    },
+    {
+      "epoch": 0.68768,
+      "grad_norm": 0.35081736607009983,
+      "learning_rate": 4.69504630717293e-05,
+      "loss": 0.5835,
+      "step": 2149
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3421663522711319,
+      "learning_rate": 4.686262982564709e-05,
+      "loss": 0.5625,
+      "step": 2150
+    },
+    {
+      "epoch": 0.68832,
+      "grad_norm": 0.330158782268883,
+      "learning_rate": 4.677485366533846e-05,
+      "loss": 0.5669,
+      "step": 2151
+    },
+    {
+      "epoch": 0.68864,
+      "grad_norm": 0.3322203301560736,
+      "learning_rate": 4.6687134685101866e-05,
+      "loss": 0.6449,
+      "step": 2152
+    },
+    {
+      "epoch": 0.68896,
+      "grad_norm": 0.3379636582895683,
+      "learning_rate": 4.6599472979174244e-05,
+      "loss": 0.5857,
+      "step": 2153
+    },
+    {
+      "epoch": 0.68928,
+      "grad_norm": 0.3717812856756649,
+      "learning_rate": 4.6511868641731104e-05,
+      "loss": 0.568,
+      "step": 2154
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3515730424662826,
+      "learning_rate": 4.6424321766886215e-05,
+      "loss": 0.591,
+      "step": 2155
+    },
+    {
+      "epoch": 0.68992,
+      "grad_norm": 0.37438832198048366,
+      "learning_rate": 4.633683244869172e-05,
+      "loss": 0.6282,
+      "step": 2156
+    },
+    {
+      "epoch": 0.69024,
+      "grad_norm": 0.33930012347487964,
+      "learning_rate": 4.624940078113789e-05,
+      "loss": 0.567,
+      "step": 2157
+    },
+    {
+      "epoch": 0.69056,
+      "grad_norm": 0.35189737408355254,
+      "learning_rate": 4.616202685815299e-05,
+      "loss": 0.583,
+      "step": 2158
+    },
+    {
+      "epoch": 0.69088,
+      "grad_norm": 0.3600147955581338,
+      "learning_rate": 4.607471077360337e-05,
+      "loss": 0.6119,
+      "step": 2159
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.34892846844248593,
+      "learning_rate": 4.59874526212932e-05,
+      "loss": 0.5768,
+      "step": 2160
+    },
+    {
+      "epoch": 0.69152,
+      "grad_norm": 0.33590850093963653,
+      "learning_rate": 4.590025249496436e-05,
+      "loss": 0.6,
+      "step": 2161
+    },
+    {
+      "epoch": 0.69184,
+      "grad_norm": 0.3273754220673678,
+      "learning_rate": 4.581311048829646e-05,
+      "loss": 0.5733,
+      "step": 2162
+    },
+    {
+      "epoch": 0.69216,
+      "grad_norm": 0.36343403171179073,
+      "learning_rate": 4.572602669490661e-05,
+      "loss": 0.5946,
+      "step": 2163
+    },
+    {
+      "epoch": 0.69248,
+      "grad_norm": 0.3528486629323474,
+      "learning_rate": 4.563900120834946e-05,
+      "loss": 0.6293,
+      "step": 2164
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.33337786487290694,
+      "learning_rate": 4.5552034122116936e-05,
+      "loss": 0.5556,
+      "step": 2165
+    },
+    {
+      "epoch": 0.69312,
+      "grad_norm": 0.34427510895914637,
+      "learning_rate": 4.5465125529638305e-05,
+      "loss": 0.6029,
+      "step": 2166
+    },
+    {
+      "epoch": 0.69344,
+      "grad_norm": 0.353709840783666,
+      "learning_rate": 4.53782755242799e-05,
+      "loss": 0.5748,
+      "step": 2167
+    },
+    {
+      "epoch": 0.69376,
+      "grad_norm": 0.35153681066549636,
+      "learning_rate": 4.5291484199345234e-05,
+      "loss": 0.5909,
+      "step": 2168
+    },
+    {
+      "epoch": 0.69408,
+      "grad_norm": 0.35000499888657527,
+      "learning_rate": 4.5204751648074636e-05,
+      "loss": 0.5761,
+      "step": 2169
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3566588805630135,
+      "learning_rate": 4.5118077963645445e-05,
+      "loss": 0.5875,
+      "step": 2170
+    },
+    {
+      "epoch": 0.69472,
+      "grad_norm": 0.32566392052613685,
+      "learning_rate": 4.503146323917162e-05,
+      "loss": 0.5598,
+      "step": 2171
+    },
+    {
+      "epoch": 0.69504,
+      "grad_norm": 0.34817018348189954,
+      "learning_rate": 4.49449075677039e-05,
+      "loss": 0.606,
+      "step": 2172
+    },
+    {
+      "epoch": 0.69536,
+      "grad_norm": 0.3493169142378676,
+      "learning_rate": 4.4858411042229465e-05,
+      "loss": 0.5809,
+      "step": 2173
+    },
+    {
+      "epoch": 0.69568,
+      "grad_norm": 0.34416584714624077,
+      "learning_rate": 4.477197375567206e-05,
+      "loss": 0.5982,
+      "step": 2174
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3236725756890863,
+      "learning_rate": 4.468559580089175e-05,
+      "loss": 0.5704,
+      "step": 2175
+    },
+    {
+      "epoch": 0.69632,
+      "grad_norm": 0.3343878917653586,
+      "learning_rate": 4.4599277270684824e-05,
+      "loss": 0.5765,
+      "step": 2176
+    },
+    {
+      "epoch": 0.69664,
+      "grad_norm": 0.3372671578789749,
+      "learning_rate": 4.451301825778376e-05,
+      "loss": 0.5752,
+      "step": 2177
+    },
+    {
+      "epoch": 0.69696,
+      "grad_norm": 0.3148645999782816,
+      "learning_rate": 4.4426818854857155e-05,
+      "loss": 0.5564,
+      "step": 2178
+    },
+    {
+      "epoch": 0.69728,
+      "grad_norm": 0.3396740773552133,
+      "learning_rate": 4.4340679154509424e-05,
+      "loss": 0.5569,
+      "step": 2179
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3415364337182879,
+      "learning_rate": 4.4254599249281016e-05,
+      "loss": 0.5836,
+      "step": 2180
+    },
+    {
+      "epoch": 0.69792,
+      "grad_norm": 0.3946165939020095,
+      "learning_rate": 4.416857923164798e-05,
+      "loss": 0.6023,
+      "step": 2181
+    },
+    {
+      "epoch": 0.69824,
+      "grad_norm": 0.37093013746780495,
+      "learning_rate": 4.4082619194022115e-05,
+      "loss": 0.648,
+      "step": 2182
+    },
+    {
+      "epoch": 0.69856,
+      "grad_norm": 0.3450062761631485,
+      "learning_rate": 4.3996719228750826e-05,
+      "loss": 0.6124,
+      "step": 2183
+    },
+    {
+      "epoch": 0.69888,
+      "grad_norm": 0.34441174318580886,
+      "learning_rate": 4.391087942811685e-05,
+      "loss": 0.5923,
+      "step": 2184
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.36483531970324995,
+      "learning_rate": 4.382509988433844e-05,
+      "loss": 0.5991,
+      "step": 2185
+    },
+    {
+      "epoch": 0.69952,
+      "grad_norm": 0.3427550850478959,
+      "learning_rate": 4.3739380689568955e-05,
+      "loss": 0.5472,
+      "step": 2186
+    },
+    {
+      "epoch": 0.69984,
+      "grad_norm": 0.35052673086789415,
+      "learning_rate": 4.365372193589704e-05,
+      "loss": 0.587,
+      "step": 2187
+    },
+    {
+      "epoch": 0.70016,
+      "grad_norm": 0.3411611014102783,
+      "learning_rate": 4.356812371534643e-05,
+      "loss": 0.5848,
+      "step": 2188
+    },
+    {
+      "epoch": 0.70048,
+      "grad_norm": 0.32664012386565394,
+      "learning_rate": 4.348258611987568e-05,
+      "loss": 0.5444,
+      "step": 2189
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3227412475218645,
+      "learning_rate": 4.339710924137835e-05,
+      "loss": 0.5493,
+      "step": 2190
+    },
+    {
+      "epoch": 0.70112,
+      "grad_norm": 0.34809704278662396,
+      "learning_rate": 4.3311693171682765e-05,
+      "loss": 0.578,
+      "step": 2191
+    },
+    {
+      "epoch": 0.70144,
+      "grad_norm": 0.3614951388063878,
+      "learning_rate": 4.3226338002551806e-05,
+      "loss": 0.6186,
+      "step": 2192
+    },
+    {
+      "epoch": 0.70176,
+      "grad_norm": 0.33865981322451133,
+      "learning_rate": 4.314104382568308e-05,
+      "loss": 0.5779,
+      "step": 2193
+    },
+    {
+      "epoch": 0.70208,
+      "grad_norm": 0.34010188618196313,
+      "learning_rate": 4.305581073270858e-05,
+      "loss": 0.5643,
+      "step": 2194
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.3449882766085503,
+      "learning_rate": 4.297063881519463e-05,
+      "loss": 0.5997,
+      "step": 2195
+    },
+    {
+      "epoch": 0.70272,
+      "grad_norm": 0.3880839796316396,
+      "learning_rate": 4.2885528164642e-05,
+      "loss": 0.5456,
+      "step": 2196
+    },
+    {
+      "epoch": 0.70304,
+      "grad_norm": 0.3452914945529851,
+      "learning_rate": 4.280047887248544e-05,
+      "loss": 0.5778,
+      "step": 2197
+    },
+    {
+      "epoch": 0.70336,
+      "grad_norm": 0.34027732818941414,
+      "learning_rate": 4.271549103009396e-05,
+      "loss": 0.5757,
+      "step": 2198
+    },
+    {
+      "epoch": 0.70368,
+      "grad_norm": 0.3716579564549702,
+      "learning_rate": 4.2630564728770396e-05,
+      "loss": 0.5908,
+      "step": 2199
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3368628859692047,
+      "learning_rate": 4.25457000597516e-05,
+      "loss": 0.5946,
+      "step": 2200
+    },
+    {
+      "epoch": 0.70432,
+      "grad_norm": 0.34163950275425115,
+      "learning_rate": 4.2460897114208173e-05,
+      "loss": 0.6224,
+      "step": 2201
+    },
+    {
+      "epoch": 0.70464,
+      "grad_norm": 0.3504970826067996,
+      "learning_rate": 4.237615598324435e-05,
+      "loss": 0.5534,
+      "step": 2202
+    },
+    {
+      "epoch": 0.70496,
+      "grad_norm": 0.3572226016436674,
+      "learning_rate": 4.229147675789801e-05,
+      "loss": 0.58,
+      "step": 2203
+    },
+    {
+      "epoch": 0.70528,
+      "grad_norm": 0.34691137822027407,
+      "learning_rate": 4.220685952914057e-05,
+      "loss": 0.5924,
+      "step": 2204
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.33739703850569674,
+      "learning_rate": 4.212230438787671e-05,
+      "loss": 0.5333,
+      "step": 2205
+    },
+    {
+      "epoch": 0.70592,
+      "grad_norm": 0.3409260034699299,
+      "learning_rate": 4.2037811424944574e-05,
+      "loss": 0.5875,
+      "step": 2206
+    },
+    {
+      "epoch": 0.70624,
+      "grad_norm": 0.3427695122891294,
+      "learning_rate": 4.1953380731115346e-05,
+      "loss": 0.5939,
+      "step": 2207
+    },
+    {
+      "epoch": 0.70656,
+      "grad_norm": 0.33617391846693423,
+      "learning_rate": 4.1869012397093424e-05,
+      "loss": 0.5588,
+      "step": 2208
+    },
+    {
+      "epoch": 0.70688,
+      "grad_norm": 0.3390340944840326,
+      "learning_rate": 4.1784706513516214e-05,
+      "loss": 0.5594,
+      "step": 2209
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.32951850359568313,
+      "learning_rate": 4.170046317095393e-05,
+      "loss": 0.6086,
+      "step": 2210
+    },
+    {
+      "epoch": 0.70752,
+      "grad_norm": 0.34260843039120775,
+      "learning_rate": 4.161628245990975e-05,
+      "loss": 0.5784,
+      "step": 2211
+    },
+    {
+      "epoch": 0.70784,
+      "grad_norm": 0.325536739055374,
+      "learning_rate": 4.153216447081939e-05,
+      "loss": 0.5634,
+      "step": 2212
+    },
+    {
+      "epoch": 0.70816,
+      "grad_norm": 0.3386819031421406,
+      "learning_rate": 4.144810929405132e-05,
+      "loss": 0.5809,
+      "step": 2213
+    },
+    {
+      "epoch": 0.70848,
+      "grad_norm": 0.34231274950439816,
+      "learning_rate": 4.136411701990652e-05,
+      "loss": 0.5819,
+      "step": 2214
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3394653712190638,
+      "learning_rate": 4.12801877386183e-05,
+      "loss": 0.5837,
+      "step": 2215
+    },
+    {
+      "epoch": 0.70912,
+      "grad_norm": 0.35804269805922173,
+      "learning_rate": 4.119632154035241e-05,
+      "loss": 0.5952,
+      "step": 2216
+    },
+    {
+      "epoch": 0.70944,
+      "grad_norm": 0.3586488862051871,
+      "learning_rate": 4.111251851520671e-05,
+      "loss": 0.5905,
+      "step": 2217
+    },
+    {
+      "epoch": 0.70976,
+      "grad_norm": 0.3349094131065414,
+      "learning_rate": 4.102877875321129e-05,
+      "loss": 0.5509,
+      "step": 2218
+    },
+    {
+      "epoch": 0.71008,
+      "grad_norm": 0.36753271991452285,
+      "learning_rate": 4.09451023443283e-05,
+      "loss": 0.6154,
+      "step": 2219
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.37342710333992696,
+      "learning_rate": 4.086148937845167e-05,
+      "loss": 0.5742,
+      "step": 2220
+    },
+    {
+      "epoch": 0.71072,
+      "grad_norm": 0.36672738966642876,
+      "learning_rate": 4.0777939945407375e-05,
+      "loss": 0.5519,
+      "step": 2221
+    },
+    {
+      "epoch": 0.71104,
+      "grad_norm": 0.3599664623076852,
+      "learning_rate": 4.069445413495295e-05,
+      "loss": 0.6215,
+      "step": 2222
+    },
+    {
+      "epoch": 0.71136,
+      "grad_norm": 0.3749111463513934,
+      "learning_rate": 4.061103203677774e-05,
+      "loss": 0.6112,
+      "step": 2223
+    },
+    {
+      "epoch": 0.71168,
+      "grad_norm": 0.34354172664774496,
+      "learning_rate": 4.052767374050255e-05,
+      "loss": 0.5861,
+      "step": 2224
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3340136786370096,
+      "learning_rate": 4.04443793356796e-05,
+      "loss": 0.5552,
+      "step": 2225
+    },
+    {
+      "epoch": 0.71232,
+      "grad_norm": 0.3189664004815407,
+      "learning_rate": 4.03611489117926e-05,
+      "loss": 0.6129,
+      "step": 2226
+    },
+    {
+      "epoch": 0.71264,
+      "grad_norm": 0.3337073767832239,
+      "learning_rate": 4.027798255825648e-05,
+      "loss": 0.5344,
+      "step": 2227
+    },
+    {
+      "epoch": 0.71296,
+      "grad_norm": 0.332111140498855,
+      "learning_rate": 4.019488036441725e-05,
+      "loss": 0.5527,
+      "step": 2228
+    },
+    {
+      "epoch": 0.71328,
+      "grad_norm": 0.34633997300019187,
+      "learning_rate": 4.011184241955213e-05,
+      "loss": 0.5889,
+      "step": 2229
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.35046950969274326,
+      "learning_rate": 4.002886881286917e-05,
+      "loss": 0.5922,
+      "step": 2230
+    },
+    {
+      "epoch": 0.71392,
+      "grad_norm": 0.3418008484457355,
+      "learning_rate": 3.9945959633507435e-05,
+      "loss": 0.5939,
+      "step": 2231
+    },
+    {
+      "epoch": 0.71424,
+      "grad_norm": 0.3332284797781076,
+      "learning_rate": 3.986311497053673e-05,
+      "loss": 0.584,
+      "step": 2232
+    },
+    {
+      "epoch": 0.71456,
+      "grad_norm": 0.370652608795854,
+      "learning_rate": 3.97803349129575e-05,
+      "loss": 0.5694,
+      "step": 2233
+    },
+    {
+      "epoch": 0.71488,
+      "grad_norm": 0.353716723198577,
+      "learning_rate": 3.969761954970082e-05,
+      "loss": 0.6438,
+      "step": 2234
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.349414260786171,
+      "learning_rate": 3.961496896962832e-05,
+      "loss": 0.5817,
+      "step": 2235
+    },
+    {
+      "epoch": 0.71552,
+      "grad_norm": 0.327908196522291,
+      "learning_rate": 3.953238326153193e-05,
+      "loss": 0.573,
+      "step": 2236
+    },
+    {
+      "epoch": 0.71584,
+      "grad_norm": 0.34876399927957835,
+      "learning_rate": 3.9449862514133975e-05,
+      "loss": 0.6092,
+      "step": 2237
+    },
+    {
+      "epoch": 0.71616,
+      "grad_norm": 0.33970579566706016,
+      "learning_rate": 3.936740681608689e-05,
+      "loss": 0.5903,
+      "step": 2238
+    },
+    {
+      "epoch": 0.71648,
+      "grad_norm": 0.3467371703056036,
+      "learning_rate": 3.928501625597335e-05,
+      "loss": 0.5812,
+      "step": 2239
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3618289787290079,
+      "learning_rate": 3.920269092230601e-05,
+      "loss": 0.5753,
+      "step": 2240
+    },
+    {
+      "epoch": 0.71712,
+      "grad_norm": 0.33453430651132277,
+      "learning_rate": 3.912043090352737e-05,
+      "loss": 0.5742,
+      "step": 2241
+    },
+    {
+      "epoch": 0.71744,
+      "grad_norm": 0.33237639957635434,
+      "learning_rate": 3.90382362880099e-05,
+      "loss": 0.5806,
+      "step": 2242
+    },
+    {
+      "epoch": 0.71776,
+      "grad_norm": 0.36920254349985016,
+      "learning_rate": 3.8956107164055656e-05,
+      "loss": 0.6318,
+      "step": 2243
+    },
+    {
+      "epoch": 0.71808,
+      "grad_norm": 0.33312389210356563,
+      "learning_rate": 3.887404361989646e-05,
+      "loss": 0.6117,
+      "step": 2244
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3624401730891918,
+      "learning_rate": 3.8792045743693674e-05,
+      "loss": 0.6194,
+      "step": 2245
+    },
+    {
+      "epoch": 0.71872,
+      "grad_norm": 0.3812328820490707,
+      "learning_rate": 3.871011362353798e-05,
+      "loss": 0.6183,
+      "step": 2246
+    },
+    {
+      "epoch": 0.71904,
+      "grad_norm": 0.34115584079319067,
+      "learning_rate": 3.862824734744961e-05,
+      "loss": 0.5807,
+      "step": 2247
+    },
+    {
+      "epoch": 0.71936,
+      "grad_norm": 0.3277684077637942,
+      "learning_rate": 3.854644700337788e-05,
+      "loss": 0.5814,
+      "step": 2248
+    },
+    {
+      "epoch": 0.71968,
+      "grad_norm": 0.3576097084128524,
+      "learning_rate": 3.846471267920143e-05,
+      "loss": 0.5993,
+      "step": 2249
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3411302003205912,
+      "learning_rate": 3.8383044462727826e-05,
+      "loss": 0.6164,
+      "step": 2250
+    },
+    {
+      "epoch": 0.72032,
+      "grad_norm": 0.37426285149209554,
+      "learning_rate": 3.830144244169377e-05,
+      "loss": 0.6164,
+      "step": 2251
+    },
+    {
+      "epoch": 0.72064,
+      "grad_norm": 0.37352400249170115,
+      "learning_rate": 3.821990670376468e-05,
+      "loss": 0.598,
+      "step": 2252
+    },
+    {
+      "epoch": 0.72096,
+      "grad_norm": 0.33679111597156436,
+      "learning_rate": 3.813843733653494e-05,
+      "loss": 0.6082,
+      "step": 2253
+    },
+    {
+      "epoch": 0.72128,
+      "grad_norm": 0.3572988792391773,
+      "learning_rate": 3.805703442752747e-05,
+      "loss": 0.5666,
+      "step": 2254
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3321327187613777,
+      "learning_rate": 3.797569806419394e-05,
+      "loss": 0.6219,
+      "step": 2255
+    },
+    {
+      "epoch": 0.72192,
+      "grad_norm": 0.34449603882458674,
+      "learning_rate": 3.78944283339144e-05,
+      "loss": 0.6428,
+      "step": 2256
+    },
+    {
+      "epoch": 0.72224,
+      "grad_norm": 0.36480563636854263,
+      "learning_rate": 3.7813225323997394e-05,
+      "loss": 0.6036,
+      "step": 2257
+    },
+    {
+      "epoch": 0.72256,
+      "grad_norm": 0.41505054186153895,
+      "learning_rate": 3.77320891216798e-05,
+      "loss": 0.5695,
+      "step": 2258
+    },
+    {
+      "epoch": 0.72288,
+      "grad_norm": 0.3376316800430768,
+      "learning_rate": 3.7651019814126654e-05,
+      "loss": 0.6033,
+      "step": 2259
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.35426071634403483,
+      "learning_rate": 3.757001748843121e-05,
+      "loss": 0.5696,
+      "step": 2260
+    },
+    {
+      "epoch": 0.72352,
+      "grad_norm": 0.35349342910908493,
+      "learning_rate": 3.748908223161466e-05,
+      "loss": 0.6225,
+      "step": 2261
+    },
+    {
+      "epoch": 0.72384,
+      "grad_norm": 0.3660884093482465,
+      "learning_rate": 3.7408214130626226e-05,
+      "loss": 0.609,
+      "step": 2262
+    },
+    {
+      "epoch": 0.72416,
+      "grad_norm": 0.36732128749850107,
+      "learning_rate": 3.732741327234301e-05,
+      "loss": 0.6025,
+      "step": 2263
+    },
+    {
+      "epoch": 0.72448,
+      "grad_norm": 0.32087709387809354,
+      "learning_rate": 3.7246679743569736e-05,
+      "loss": 0.5603,
+      "step": 2264
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3918461582081944,
+      "learning_rate": 3.716601363103894e-05,
+      "loss": 0.5982,
+      "step": 2265
+    },
+    {
+      "epoch": 0.72512,
+      "grad_norm": 0.3382516923879686,
+      "learning_rate": 3.7085415021410706e-05,
+      "loss": 0.5844,
+      "step": 2266
+    },
+    {
+      "epoch": 0.72544,
+      "grad_norm": 0.3586032250212653,
+      "learning_rate": 3.7004884001272486e-05,
+      "loss": 0.5933,
+      "step": 2267
+    },
+    {
+      "epoch": 0.72576,
+      "grad_norm": 0.3578244878400089,
+      "learning_rate": 3.6924420657139304e-05,
+      "loss": 0.6197,
+      "step": 2268
+    },
+    {
+      "epoch": 0.72608,
+      "grad_norm": 0.3274635023599598,
+      "learning_rate": 3.684402507545329e-05,
+      "loss": 0.586,
+      "step": 2269
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3620469443544567,
+      "learning_rate": 3.6763697342583905e-05,
+      "loss": 0.5895,
+      "step": 2270
+    },
+    {
+      "epoch": 0.72672,
+      "grad_norm": 0.34616989961039274,
+      "learning_rate": 3.66834375448277e-05,
+      "loss": 0.5577,
+      "step": 2271
+    },
+    {
+      "epoch": 0.72704,
+      "grad_norm": 0.3198872973298801,
+      "learning_rate": 3.660324576840819e-05,
+      "loss": 0.5394,
+      "step": 2272
+    },
+    {
+      "epoch": 0.72736,
+      "grad_norm": 0.32009968478428624,
+      "learning_rate": 3.652312209947589e-05,
+      "loss": 0.5802,
+      "step": 2273
+    },
+    {
+      "epoch": 0.72768,
+      "grad_norm": 0.36490577158852383,
+      "learning_rate": 3.644306662410805e-05,
+      "loss": 0.6055,
+      "step": 2274
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.36374517482268487,
+      "learning_rate": 3.6363079428308776e-05,
+      "loss": 0.6059,
+      "step": 2275
+    },
+    {
+      "epoch": 0.72832,
+      "grad_norm": 0.34302612410676386,
+      "learning_rate": 3.628316059800868e-05,
+      "loss": 0.5782,
+      "step": 2276
+    },
+    {
+      "epoch": 0.72864,
+      "grad_norm": 0.3668817865393265,
+      "learning_rate": 3.62033102190651e-05,
+      "loss": 0.5935,
+      "step": 2277
+    },
+    {
+      "epoch": 0.72896,
+      "grad_norm": 0.362158383809741,
+      "learning_rate": 3.612352837726166e-05,
+      "loss": 0.5743,
+      "step": 2278
+    },
+    {
+      "epoch": 0.72928,
+      "grad_norm": 0.33252307212965837,
+      "learning_rate": 3.6043815158308516e-05,
+      "loss": 0.5968,
+      "step": 2279
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.34244771372003796,
+      "learning_rate": 3.5964170647841943e-05,
+      "loss": 0.5985,
+      "step": 2280
+    },
+    {
+      "epoch": 0.72992,
+      "grad_norm": 0.34611776184090504,
+      "learning_rate": 3.588459493142456e-05,
+      "loss": 0.5963,
+      "step": 2281
+    },
+    {
+      "epoch": 0.73024,
+      "grad_norm": 0.3562953662835525,
+      "learning_rate": 3.580508809454494e-05,
+      "loss": 0.5682,
+      "step": 2282
+    },
+    {
+      "epoch": 0.73056,
+      "grad_norm": 0.3277345038138114,
+      "learning_rate": 3.572565022261775e-05,
+      "loss": 0.5756,
+      "step": 2283
+    },
+    {
+      "epoch": 0.73088,
+      "grad_norm": 0.32874532005793394,
+      "learning_rate": 3.5646281400983574e-05,
+      "loss": 0.5859,
+      "step": 2284
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3401882822854291,
+      "learning_rate": 3.556698171490871e-05,
+      "loss": 0.5899,
+      "step": 2285
+    },
+    {
+      "epoch": 0.73152,
+      "grad_norm": 0.405455404993615,
+      "learning_rate": 3.548775124958532e-05,
+      "loss": 0.5457,
+      "step": 2286
+    },
+    {
+      "epoch": 0.73184,
+      "grad_norm": 0.391038851640563,
+      "learning_rate": 3.540859009013108e-05,
+      "loss": 0.6236,
+      "step": 2287
+    },
+    {
+      "epoch": 0.73216,
+      "grad_norm": 0.3603780442231415,
+      "learning_rate": 3.532949832158928e-05,
+      "loss": 0.5599,
+      "step": 2288
+    },
+    {
+      "epoch": 0.73248,
+      "grad_norm": 0.35753046410297185,
+      "learning_rate": 3.5250476028928715e-05,
+      "loss": 0.5793,
+      "step": 2289
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.325083022215054,
+      "learning_rate": 3.517152329704337e-05,
+      "loss": 0.5893,
+      "step": 2290
+    },
+    {
+      "epoch": 0.73312,
+      "grad_norm": 0.33934670102174425,
+      "learning_rate": 3.509264021075269e-05,
+      "loss": 0.5592,
+      "step": 2291
+    },
+    {
+      "epoch": 0.73344,
+      "grad_norm": 0.3629510582047156,
+      "learning_rate": 3.501382685480116e-05,
+      "loss": 0.5867,
+      "step": 2292
+    },
+    {
+      "epoch": 0.73376,
+      "grad_norm": 0.33460678087081486,
+      "learning_rate": 3.493508331385842e-05,
+      "loss": 0.5617,
+      "step": 2293
+    },
+    {
+      "epoch": 0.73408,
+      "grad_norm": 0.36053140140845014,
+      "learning_rate": 3.485640967251914e-05,
+      "loss": 0.5853,
+      "step": 2294
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.38112213895245,
+      "learning_rate": 3.4777806015302796e-05,
+      "loss": 0.6554,
+      "step": 2295
+    },
+    {
+      "epoch": 0.73472,
+      "grad_norm": 0.3155893188801853,
+      "learning_rate": 3.469927242665375e-05,
+      "loss": 0.5347,
+      "step": 2296
+    },
+    {
+      "epoch": 0.73504,
+      "grad_norm": 0.33097273265734717,
+      "learning_rate": 3.462080899094111e-05,
+      "loss": 0.6048,
+      "step": 2297
+    },
+    {
+      "epoch": 0.73536,
+      "grad_norm": 0.32406528689576003,
+      "learning_rate": 3.454241579245854e-05,
+      "loss": 0.5445,
+      "step": 2298
+    },
+    {
+      "epoch": 0.73568,
+      "grad_norm": 0.3352555994003734,
+      "learning_rate": 3.446409291542433e-05,
+      "loss": 0.5804,
+      "step": 2299
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.33904697657405386,
+      "learning_rate": 3.438584044398113e-05,
+      "loss": 0.5923,
+      "step": 2300
+    },
+    {
+      "epoch": 0.73632,
+      "grad_norm": 0.3274710727691608,
+      "learning_rate": 3.430765846219603e-05,
+      "loss": 0.5771,
+      "step": 2301
+    },
+    {
+      "epoch": 0.73664,
+      "grad_norm": 0.3312216807151814,
+      "learning_rate": 3.422954705406043e-05,
+      "loss": 0.5868,
+      "step": 2302
+    },
+    {
+      "epoch": 0.73696,
+      "grad_norm": 0.3505180134891135,
+      "learning_rate": 3.415150630348977e-05,
+      "loss": 0.5796,
+      "step": 2303
+    },
+    {
+      "epoch": 0.73728,
+      "grad_norm": 0.3376211050147904,
+      "learning_rate": 3.4073536294323705e-05,
+      "loss": 0.5807,
+      "step": 2304
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3384012217882616,
+      "learning_rate": 3.399563711032583e-05,
+      "loss": 0.5528,
+      "step": 2305
+    },
+    {
+      "epoch": 0.73792,
+      "grad_norm": 0.32379561738608786,
+      "learning_rate": 3.3917808835183706e-05,
+      "loss": 0.6018,
+      "step": 2306
+    },
+    {
+      "epoch": 0.73824,
+      "grad_norm": 0.33816519346897606,
+      "learning_rate": 3.384005155250867e-05,
+      "loss": 0.6139,
+      "step": 2307
+    },
+    {
+      "epoch": 0.73856,
+      "grad_norm": 0.3361173981310898,
+      "learning_rate": 3.376236534583576e-05,
+      "loss": 0.5965,
+      "step": 2308
+    },
+    {
+      "epoch": 0.73888,
+      "grad_norm": 0.376368344839782,
+      "learning_rate": 3.368475029862373e-05,
+      "loss": 0.6307,
+      "step": 2309
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3400849938851062,
+      "learning_rate": 3.3607206494254914e-05,
+      "loss": 0.6181,
+      "step": 2310
+    },
+    {
+      "epoch": 0.73952,
+      "grad_norm": 0.3465621007928963,
+      "learning_rate": 3.352973401603499e-05,
+      "loss": 0.5998,
+      "step": 2311
+    },
+    {
+      "epoch": 0.73984,
+      "grad_norm": 0.3606098793187089,
+      "learning_rate": 3.34523329471931e-05,
+      "loss": 0.5891,
+      "step": 2312
+    },
+    {
+      "epoch": 0.74016,
+      "grad_norm": 0.33224624071446074,
+      "learning_rate": 3.337500337088162e-05,
+      "loss": 0.5275,
+      "step": 2313
+    },
+    {
+      "epoch": 0.74048,
+      "grad_norm": 0.3252397944789401,
+      "learning_rate": 3.329774537017616e-05,
+      "loss": 0.5804,
+      "step": 2314
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.35389135430673174,
+      "learning_rate": 3.322055902807545e-05,
+      "loss": 0.5961,
+      "step": 2315
+    },
+    {
+      "epoch": 0.74112,
+      "grad_norm": 0.3349695026013135,
+      "learning_rate": 3.314344442750116e-05,
+      "loss": 0.5687,
+      "step": 2316
+    },
+    {
+      "epoch": 0.74144,
+      "grad_norm": 0.34821436421442353,
+      "learning_rate": 3.306640165129799e-05,
+      "loss": 0.639,
+      "step": 2317
+    },
+    {
+      "epoch": 0.74176,
+      "grad_norm": 0.3422837278270685,
+      "learning_rate": 3.298943078223334e-05,
+      "loss": 0.5472,
+      "step": 2318
+    },
+    {
+      "epoch": 0.74208,
+      "grad_norm": 0.37652125465124586,
+      "learning_rate": 3.2912531902997524e-05,
+      "loss": 0.6127,
+      "step": 2319
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3417975229253376,
+      "learning_rate": 3.283570509620344e-05,
+      "loss": 0.5816,
+      "step": 2320
+    },
+    {
+      "epoch": 0.74272,
+      "grad_norm": 0.336177798926716,
+      "learning_rate": 3.275895044438649e-05,
+      "loss": 0.5837,
+      "step": 2321
+    },
+    {
+      "epoch": 0.74304,
+      "grad_norm": 0.3697089660276437,
+      "learning_rate": 3.26822680300047e-05,
+      "loss": 0.5831,
+      "step": 2322
+    },
+    {
+      "epoch": 0.74336,
+      "grad_norm": 0.3402634791319304,
+      "learning_rate": 3.260565793543835e-05,
+      "loss": 0.5591,
+      "step": 2323
+    },
+    {
+      "epoch": 0.74368,
+      "grad_norm": 0.3717472540096618,
+      "learning_rate": 3.252912024299012e-05,
+      "loss": 0.5955,
+      "step": 2324
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3514658273470325,
+      "learning_rate": 3.24526550348849e-05,
+      "loss": 0.6062,
+      "step": 2325
+    },
+    {
+      "epoch": 0.74432,
+      "grad_norm": 0.33652106281200506,
+      "learning_rate": 3.237626239326965e-05,
+      "loss": 0.5467,
+      "step": 2326
+    },
+    {
+      "epoch": 0.74464,
+      "grad_norm": 0.32989357684968657,
+      "learning_rate": 3.2299942400213446e-05,
+      "loss": 0.6034,
+      "step": 2327
+    },
+    {
+      "epoch": 0.74496,
+      "grad_norm": 0.32928431954703236,
+      "learning_rate": 3.222369513770729e-05,
+      "loss": 0.5549,
+      "step": 2328
+    },
+    {
+      "epoch": 0.74528,
+      "grad_norm": 0.3451671890137395,
+      "learning_rate": 3.214752068766399e-05,
+      "loss": 0.5612,
+      "step": 2329
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3415621954459219,
+      "learning_rate": 3.207141913191826e-05,
+      "loss": 0.6022,
+      "step": 2330
+    },
+    {
+      "epoch": 0.74592,
+      "grad_norm": 0.3989017659278773,
+      "learning_rate": 3.1995390552226336e-05,
+      "loss": 0.6029,
+      "step": 2331
+    },
+    {
+      "epoch": 0.74624,
+      "grad_norm": 0.381837055963759,
+      "learning_rate": 3.191943503026622e-05,
+      "loss": 0.6244,
+      "step": 2332
+    },
+    {
+      "epoch": 0.74656,
+      "grad_norm": 0.33126585934733654,
+      "learning_rate": 3.184355264763731e-05,
+      "loss": 0.6068,
+      "step": 2333
+    },
+    {
+      "epoch": 0.74688,
+      "grad_norm": 0.3536341899877415,
+      "learning_rate": 3.176774348586051e-05,
+      "loss": 0.616,
+      "step": 2334
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3301539930652797,
+      "learning_rate": 3.1692007626377985e-05,
+      "loss": 0.5831,
+      "step": 2335
+    },
+    {
+      "epoch": 0.74752,
+      "grad_norm": 0.3369603625025938,
+      "learning_rate": 3.161634515055323e-05,
+      "loss": 0.5537,
+      "step": 2336
+    },
+    {
+      "epoch": 0.74784,
+      "grad_norm": 0.34788287232024545,
+      "learning_rate": 3.154075613967082e-05,
+      "loss": 0.5715,
+      "step": 2337
+    },
+    {
+      "epoch": 0.74816,
+      "grad_norm": 0.34199407523362596,
+      "learning_rate": 3.1465240674936514e-05,
+      "loss": 0.5558,
+      "step": 2338
+    },
+    {
+      "epoch": 0.74848,
+      "grad_norm": 0.3450282930035882,
+      "learning_rate": 3.138979883747692e-05,
+      "loss": 0.6083,
+      "step": 2339
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.34494890222555324,
+      "learning_rate": 3.131443070833968e-05,
+      "loss": 0.5348,
+      "step": 2340
+    },
+    {
+      "epoch": 0.74912,
+      "grad_norm": 0.33275235016148924,
+      "learning_rate": 3.1239136368493216e-05,
+      "loss": 0.542,
+      "step": 2341
+    },
+    {
+      "epoch": 0.74944,
+      "grad_norm": 0.33857138631779304,
+      "learning_rate": 3.116391589882659e-05,
+      "loss": 0.5713,
+      "step": 2342
+    },
+    {
+      "epoch": 0.74976,
+      "grad_norm": 0.33607981110956336,
+      "learning_rate": 3.108876938014964e-05,
+      "loss": 0.6235,
+      "step": 2343
+    },
+    {
+      "epoch": 0.75008,
+      "grad_norm": 0.3468769088146216,
+      "learning_rate": 3.101369689319263e-05,
+      "loss": 0.5832,
+      "step": 2344
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.3316342090040698,
+      "learning_rate": 3.093869851860638e-05,
+      "loss": 0.5911,
+      "step": 2345
+    },
+    {
+      "epoch": 0.75072,
+      "grad_norm": 0.3581437347975455,
+      "learning_rate": 3.08637743369621e-05,
+      "loss": 0.5667,
+      "step": 2346
+    },
+    {
+      "epoch": 0.75104,
+      "grad_norm": 0.3465195910789064,
+      "learning_rate": 3.078892442875119e-05,
+      "loss": 0.5399,
+      "step": 2347
+    },
+    {
+      "epoch": 0.75136,
+      "grad_norm": 0.3556462190377705,
+      "learning_rate": 3.071414887438537e-05,
+      "loss": 0.5732,
+      "step": 2348
+    },
+    {
+      "epoch": 0.75168,
+      "grad_norm": 0.34595846905129507,
+      "learning_rate": 3.063944775419641e-05,
+      "loss": 0.5484,
+      "step": 2349
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3446135367032066,
+      "learning_rate": 3.056482114843614e-05,
+      "loss": 0.5553,
+      "step": 2350
+    },
+    {
+      "epoch": 0.75232,
+      "grad_norm": 0.3607712135522439,
+      "learning_rate": 3.0490269137276394e-05,
+      "loss": 0.626,
+      "step": 2351
+    },
+    {
+      "epoch": 0.75264,
+      "grad_norm": 0.3402565405559765,
+      "learning_rate": 3.0415791800808723e-05,
+      "loss": 0.6187,
+      "step": 2352
+    },
+    {
+      "epoch": 0.75296,
+      "grad_norm": 0.3295917465282104,
+      "learning_rate": 3.0341389219044615e-05,
+      "loss": 0.5493,
+      "step": 2353
+    },
+    {
+      "epoch": 0.75328,
+      "grad_norm": 0.3519275515043085,
+      "learning_rate": 3.026706147191517e-05,
+      "loss": 0.5767,
+      "step": 2354
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3408333723271029,
+      "learning_rate": 3.0192808639271065e-05,
+      "loss": 0.6295,
+      "step": 2355
+    },
+    {
+      "epoch": 0.75392,
+      "grad_norm": 0.3470666538922749,
+      "learning_rate": 3.0118630800882596e-05,
+      "loss": 0.5814,
+      "step": 2356
+    },
+    {
+      "epoch": 0.75424,
+      "grad_norm": 0.31666937870659334,
+      "learning_rate": 3.0044528036439357e-05,
+      "loss": 0.5683,
+      "step": 2357
+    },
+    {
+      "epoch": 0.75456,
+      "grad_norm": 0.33637773717051944,
+      "learning_rate": 2.9970500425550417e-05,
+      "loss": 0.5963,
+      "step": 2358
+    },
+    {
+      "epoch": 0.75488,
+      "grad_norm": 0.33786096292751916,
+      "learning_rate": 2.989654804774401e-05,
+      "loss": 0.5816,
+      "step": 2359
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.32677482985465583,
+      "learning_rate": 2.9822670982467637e-05,
+      "loss": 0.5942,
+      "step": 2360
+    },
+    {
+      "epoch": 0.75552,
+      "grad_norm": 0.3338809730277252,
+      "learning_rate": 2.9748869309087778e-05,
+      "loss": 0.5483,
+      "step": 2361
+    },
+    {
+      "epoch": 0.75584,
+      "grad_norm": 0.33174851524524673,
+      "learning_rate": 2.9675143106890056e-05,
+      "loss": 0.5796,
+      "step": 2362
+    },
+    {
+      "epoch": 0.75616,
+      "grad_norm": 0.3324989595799846,
+      "learning_rate": 2.9601492455078872e-05,
+      "loss": 0.6124,
+      "step": 2363
+    },
+    {
+      "epoch": 0.75648,
+      "grad_norm": 0.3362730718367246,
+      "learning_rate": 2.9527917432777595e-05,
+      "loss": 0.603,
+      "step": 2364
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.3617593581459504,
+      "learning_rate": 2.9454418119028227e-05,
+      "loss": 0.603,
+      "step": 2365
+    },
+    {
+      "epoch": 0.75712,
+      "grad_norm": 0.3471047988231789,
+      "learning_rate": 2.9380994592791545e-05,
+      "loss": 0.5695,
+      "step": 2366
+    },
+    {
+      "epoch": 0.75744,
+      "grad_norm": 0.3400950061793918,
+      "learning_rate": 2.9307646932946797e-05,
+      "loss": 0.5897,
+      "step": 2367
+    },
+    {
+      "epoch": 0.75776,
+      "grad_norm": 0.348662092469313,
+      "learning_rate": 2.923437521829181e-05,
+      "loss": 0.5527,
+      "step": 2368
+    },
+    {
+      "epoch": 0.75808,
+      "grad_norm": 0.3236855574460617,
+      "learning_rate": 2.9161179527542827e-05,
+      "loss": 0.5643,
+      "step": 2369
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.34981066030108027,
+      "learning_rate": 2.9088059939334332e-05,
+      "loss": 0.6093,
+      "step": 2370
+    },
+    {
+      "epoch": 0.75872,
+      "grad_norm": 0.3289082649092464,
+      "learning_rate": 2.9015016532219132e-05,
+      "loss": 0.5921,
+      "step": 2371
+    },
+    {
+      "epoch": 0.75904,
+      "grad_norm": 0.32879029315275066,
+      "learning_rate": 2.89420493846682e-05,
+      "loss": 0.5972,
+      "step": 2372
+    },
+    {
+      "epoch": 0.75936,
+      "grad_norm": 0.32476863522355104,
+      "learning_rate": 2.8869158575070488e-05,
+      "loss": 0.5829,
+      "step": 2373
+    },
+    {
+      "epoch": 0.75968,
+      "grad_norm": 0.33679510274302665,
+      "learning_rate": 2.8796344181733058e-05,
+      "loss": 0.5977,
+      "step": 2374
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.36180129427645297,
+      "learning_rate": 2.8723606282880765e-05,
+      "loss": 0.6201,
+      "step": 2375
+    },
+    {
+      "epoch": 0.76032,
+      "grad_norm": 0.3343661758410183,
+      "learning_rate": 2.865094495665638e-05,
+      "loss": 0.5824,
+      "step": 2376
+    },
+    {
+      "epoch": 0.76064,
+      "grad_norm": 0.352082421340757,
+      "learning_rate": 2.8578360281120377e-05,
+      "loss": 0.5986,
+      "step": 2377
+    },
+    {
+      "epoch": 0.76096,
+      "grad_norm": 0.35789151503896116,
+      "learning_rate": 2.8505852334250825e-05,
+      "loss": 0.6062,
+      "step": 2378
+    },
+    {
+      "epoch": 0.76128,
+      "grad_norm": 0.3685475388743482,
+      "learning_rate": 2.84334211939435e-05,
+      "loss": 0.6083,
+      "step": 2379
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3340106042654049,
+      "learning_rate": 2.836106693801148e-05,
+      "loss": 0.5777,
+      "step": 2380
+    },
+    {
+      "epoch": 0.76192,
+      "grad_norm": 0.38552664950055376,
+      "learning_rate": 2.828878964418542e-05,
+      "loss": 0.5968,
+      "step": 2381
+    },
+    {
+      "epoch": 0.76224,
+      "grad_norm": 0.3445694135476144,
+      "learning_rate": 2.8216589390113214e-05,
+      "loss": 0.5659,
+      "step": 2382
+    },
+    {
+      "epoch": 0.76256,
+      "grad_norm": 0.3295777802569321,
+      "learning_rate": 2.814446625335997e-05,
+      "loss": 0.5686,
+      "step": 2383
+    },
+    {
+      "epoch": 0.76288,
+      "grad_norm": 0.3446027220236407,
+      "learning_rate": 2.8072420311407977e-05,
+      "loss": 0.6051,
+      "step": 2384
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.33973972123166163,
+      "learning_rate": 2.8000451641656633e-05,
+      "loss": 0.5734,
+      "step": 2385
+    },
+    {
+      "epoch": 0.76352,
+      "grad_norm": 0.36079199185827376,
+      "learning_rate": 2.7928560321422237e-05,
+      "loss": 0.5711,
+      "step": 2386
+    },
+    {
+      "epoch": 0.76384,
+      "grad_norm": 0.34253463173699983,
+      "learning_rate": 2.7856746427938073e-05,
+      "loss": 0.5762,
+      "step": 2387
+    },
+    {
+      "epoch": 0.76416,
+      "grad_norm": 0.35533724153056895,
+      "learning_rate": 2.7785010038354197e-05,
+      "loss": 0.5945,
+      "step": 2388
+    },
+    {
+      "epoch": 0.76448,
+      "grad_norm": 0.34504298045201975,
+      "learning_rate": 2.7713351229737372e-05,
+      "loss": 0.5778,
+      "step": 2389
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3487361865298093,
+      "learning_rate": 2.7641770079071127e-05,
+      "loss": 0.5675,
+      "step": 2390
+    },
+    {
+      "epoch": 0.76512,
+      "grad_norm": 0.3452140897641801,
+      "learning_rate": 2.7570266663255417e-05,
+      "loss": 0.5891,
+      "step": 2391
+    },
+    {
+      "epoch": 0.76544,
+      "grad_norm": 0.32948686627018636,
+      "learning_rate": 2.7498841059106827e-05,
+      "loss": 0.5769,
+      "step": 2392
+    },
+    {
+      "epoch": 0.76576,
+      "grad_norm": 0.3430492211172593,
+      "learning_rate": 2.7427493343358224e-05,
+      "loss": 0.5989,
+      "step": 2393
+    },
+    {
+      "epoch": 0.76608,
+      "grad_norm": 0.33875464109493547,
+      "learning_rate": 2.735622359265889e-05,
+      "loss": 0.5895,
+      "step": 2394
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3439041615207522,
+      "learning_rate": 2.728503188357434e-05,
+      "loss": 0.5534,
+      "step": 2395
+    },
+    {
+      "epoch": 0.76672,
+      "grad_norm": 0.34886016012964727,
+      "learning_rate": 2.7213918292586173e-05,
+      "loss": 0.5861,
+      "step": 2396
+    },
+    {
+      "epoch": 0.76704,
+      "grad_norm": 0.32669939047169744,
+      "learning_rate": 2.714288289609217e-05,
+      "loss": 0.6137,
+      "step": 2397
+    },
+    {
+      "epoch": 0.76736,
+      "grad_norm": 0.36101603744040395,
+      "learning_rate": 2.7071925770405992e-05,
+      "loss": 0.5998,
+      "step": 2398
+    },
+    {
+      "epoch": 0.76768,
+      "grad_norm": 0.3328698049958207,
+      "learning_rate": 2.700104699175732e-05,
+      "loss": 0.5831,
+      "step": 2399
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3510927935259629,
+      "learning_rate": 2.6930246636291635e-05,
+      "loss": 0.5856,
+      "step": 2400
+    },
+    {
+      "epoch": 0.76832,
+      "grad_norm": 0.3312101945503044,
+      "learning_rate": 2.68595247800701e-05,
+      "loss": 0.5997,
+      "step": 2401
+    },
+    {
+      "epoch": 0.76864,
+      "grad_norm": 0.3430926524554503,
+      "learning_rate": 2.678888149906964e-05,
+      "loss": 0.6051,
+      "step": 2402
+    },
+    {
+      "epoch": 0.76896,
+      "grad_norm": 0.3358767848755449,
+      "learning_rate": 2.6718316869182735e-05,
+      "loss": 0.5756,
+      "step": 2403
+    },
+    {
+      "epoch": 0.76928,
+      "grad_norm": 0.3391095082693979,
+      "learning_rate": 2.6647830966217325e-05,
+      "loss": 0.5882,
+      "step": 2404
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.341842787369802,
+      "learning_rate": 2.6577423865896856e-05,
+      "loss": 0.557,
+      "step": 2405
+    },
+    {
+      "epoch": 0.76992,
+      "grad_norm": 0.3239114474441497,
+      "learning_rate": 2.650709564386e-05,
+      "loss": 0.577,
+      "step": 2406
+    },
+    {
+      "epoch": 0.77024,
+      "grad_norm": 0.34586815567672,
+      "learning_rate": 2.6436846375660816e-05,
+      "loss": 0.5922,
+      "step": 2407
+    },
+    {
+      "epoch": 0.77056,
+      "grad_norm": 0.334961876383252,
+      "learning_rate": 2.6366676136768486e-05,
+      "loss": 0.6024,
+      "step": 2408
+    },
+    {
+      "epoch": 0.77088,
+      "grad_norm": 0.3791633433224865,
+      "learning_rate": 2.6296585002567252e-05,
+      "loss": 0.5835,
+      "step": 2409
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.349845300823539,
+      "learning_rate": 2.622657304835646e-05,
+      "loss": 0.5667,
+      "step": 2410
+    },
+    {
+      "epoch": 0.77152,
+      "grad_norm": 0.3551498303535744,
+      "learning_rate": 2.615664034935028e-05,
+      "loss": 0.5732,
+      "step": 2411
+    },
+    {
+      "epoch": 0.77184,
+      "grad_norm": 0.31879527794421286,
+      "learning_rate": 2.6086786980677837e-05,
+      "loss": 0.5355,
+      "step": 2412
+    },
+    {
+      "epoch": 0.77216,
+      "grad_norm": 0.3262907843763252,
+      "learning_rate": 2.601701301738303e-05,
+      "loss": 0.543,
+      "step": 2413
+    },
+    {
+      "epoch": 0.77248,
+      "grad_norm": 0.35623855490669876,
+      "learning_rate": 2.5947318534424346e-05,
+      "loss": 0.573,
+      "step": 2414
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.3767124025771466,
+      "learning_rate": 2.587770360667503e-05,
+      "loss": 0.5501,
+      "step": 2415
+    },
+    {
+      "epoch": 0.77312,
+      "grad_norm": 0.32759350047708313,
+      "learning_rate": 2.580816830892272e-05,
+      "loss": 0.5474,
+      "step": 2416
+    },
+    {
+      "epoch": 0.77344,
+      "grad_norm": 0.35442865582215455,
+      "learning_rate": 2.573871271586963e-05,
+      "loss": 0.5801,
+      "step": 2417
+    },
+    {
+      "epoch": 0.77376,
+      "grad_norm": 0.36348390933380514,
+      "learning_rate": 2.5669336902132234e-05,
+      "loss": 0.5636,
+      "step": 2418
+    },
+    {
+      "epoch": 0.77408,
+      "grad_norm": 0.3337592582572294,
+      "learning_rate": 2.560004094224143e-05,
+      "loss": 0.568,
+      "step": 2419
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.33280285788443853,
+      "learning_rate": 2.5530824910642183e-05,
+      "loss": 0.5749,
+      "step": 2420
+    },
+    {
+      "epoch": 0.77472,
+      "grad_norm": 0.3448406590938855,
+      "learning_rate": 2.5461688881693723e-05,
+      "loss": 0.5969,
+      "step": 2421
+    },
+    {
+      "epoch": 0.77504,
+      "grad_norm": 0.3515999885584849,
+      "learning_rate": 2.5392632929669213e-05,
+      "loss": 0.5886,
+      "step": 2422
+    },
+    {
+      "epoch": 0.77536,
+      "grad_norm": 0.33735108637060596,
+      "learning_rate": 2.5323657128755895e-05,
+      "loss": 0.5806,
+      "step": 2423
+    },
+    {
+      "epoch": 0.77568,
+      "grad_norm": 0.35121809997582326,
+      "learning_rate": 2.525476155305483e-05,
+      "loss": 0.6119,
+      "step": 2424
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.33504541137736965,
+      "learning_rate": 2.5185946276580918e-05,
+      "loss": 0.5855,
+      "step": 2425
+    },
+    {
+      "epoch": 0.77632,
+      "grad_norm": 0.342514163567369,
+      "learning_rate": 2.511721137326284e-05,
+      "loss": 0.5453,
+      "step": 2426
+    },
+    {
+      "epoch": 0.77664,
+      "grad_norm": 0.3469120002174987,
+      "learning_rate": 2.5048556916942824e-05,
+      "loss": 0.55,
+      "step": 2427
+    },
+    {
+      "epoch": 0.77696,
+      "grad_norm": 0.34586812850714027,
+      "learning_rate": 2.497998298137676e-05,
+      "loss": 0.5992,
+      "step": 2428
+    },
+    {
+      "epoch": 0.77728,
+      "grad_norm": 0.33724972199239284,
+      "learning_rate": 2.4911489640234055e-05,
+      "loss": 0.6001,
+      "step": 2429
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3512325189234438,
+      "learning_rate": 2.484307696709741e-05,
+      "loss": 0.5819,
+      "step": 2430
+    },
+    {
+      "epoch": 0.77792,
+      "grad_norm": 0.349654684211299,
+      "learning_rate": 2.4774745035463008e-05,
+      "loss": 0.6303,
+      "step": 2431
+    },
+    {
+      "epoch": 0.77824,
+      "grad_norm": 0.33453337928277854,
+      "learning_rate": 2.470649391874017e-05,
+      "loss": 0.5359,
+      "step": 2432
+    },
+    {
+      "epoch": 0.77856,
+      "grad_norm": 0.3416789705540942,
+      "learning_rate": 2.4638323690251486e-05,
+      "loss": 0.5783,
+      "step": 2433
+    },
+    {
+      "epoch": 0.77888,
+      "grad_norm": 0.34156817967059255,
+      "learning_rate": 2.457023442323262e-05,
+      "loss": 0.5715,
+      "step": 2434
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3368542957143385,
+      "learning_rate": 2.4502226190832222e-05,
+      "loss": 0.5813,
+      "step": 2435
+    },
+    {
+      "epoch": 0.77952,
+      "grad_norm": 0.327260707366308,
+      "learning_rate": 2.4434299066111953e-05,
+      "loss": 0.5449,
+      "step": 2436
+    },
+    {
+      "epoch": 0.77984,
+      "grad_norm": 0.3524152885666758,
+      "learning_rate": 2.4366453122046263e-05,
+      "loss": 0.6044,
+      "step": 2437
+    },
+    {
+      "epoch": 0.78016,
+      "grad_norm": 0.35477651008206396,
+      "learning_rate": 2.429868843152243e-05,
+      "loss": 0.5971,
+      "step": 2438
+    },
+    {
+      "epoch": 0.78048,
+      "grad_norm": 0.4093698158410259,
+      "learning_rate": 2.4231005067340508e-05,
+      "loss": 0.6219,
+      "step": 2439
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3528251215152049,
+      "learning_rate": 2.4163403102213012e-05,
+      "loss": 0.5619,
+      "step": 2440
+    },
+    {
+      "epoch": 0.78112,
+      "grad_norm": 0.35393343997845517,
+      "learning_rate": 2.4095882608765196e-05,
+      "loss": 0.5801,
+      "step": 2441
+    },
+    {
+      "epoch": 0.78144,
+      "grad_norm": 0.3434472851711151,
+      "learning_rate": 2.4028443659534638e-05,
+      "loss": 0.5451,
+      "step": 2442
+    },
+    {
+      "epoch": 0.78176,
+      "grad_norm": 0.4105730561393544,
+      "learning_rate": 2.396108632697145e-05,
+      "loss": 0.5907,
+      "step": 2443
+    },
+    {
+      "epoch": 0.78208,
+      "grad_norm": 0.3629371386000478,
+      "learning_rate": 2.38938106834379e-05,
+      "loss": 0.5737,
+      "step": 2444
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.33696633482943883,
+      "learning_rate": 2.382661680120868e-05,
+      "loss": 0.584,
+      "step": 2445
+    },
+    {
+      "epoch": 0.78272,
+      "grad_norm": 0.3449131704792935,
+      "learning_rate": 2.3759504752470463e-05,
+      "loss": 0.573,
+      "step": 2446
+    },
+    {
+      "epoch": 0.78304,
+      "grad_norm": 0.3727591769601757,
+      "learning_rate": 2.369247460932219e-05,
+      "loss": 0.6211,
+      "step": 2447
+    },
+    {
+      "epoch": 0.78336,
+      "grad_norm": 0.32671458489505556,
+      "learning_rate": 2.3625526443774636e-05,
+      "loss": 0.5821,
+      "step": 2448
+    },
+    {
+      "epoch": 0.78368,
+      "grad_norm": 0.354124806828755,
+      "learning_rate": 2.3558660327750647e-05,
+      "loss": 0.5517,
+      "step": 2449
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.34426795256547404,
+      "learning_rate": 2.349187633308483e-05,
+      "loss": 0.5745,
+      "step": 2450
+    },
+    {
+      "epoch": 0.78432,
+      "grad_norm": 0.3494513559440474,
+      "learning_rate": 2.3425174531523596e-05,
+      "loss": 0.6134,
+      "step": 2451
+    },
+    {
+      "epoch": 0.78464,
+      "grad_norm": 0.35765403671344015,
+      "learning_rate": 2.3358554994725123e-05,
+      "loss": 0.6041,
+      "step": 2452
+    },
+    {
+      "epoch": 0.78496,
+      "grad_norm": 0.35954927876655834,
+      "learning_rate": 2.329201779425909e-05,
+      "loss": 0.5512,
+      "step": 2453
+    },
+    {
+      "epoch": 0.78528,
+      "grad_norm": 0.37114122005555067,
+      "learning_rate": 2.322556300160682e-05,
+      "loss": 0.6187,
+      "step": 2454
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.351944774645019,
+      "learning_rate": 2.3159190688161038e-05,
+      "loss": 0.5522,
+      "step": 2455
+    },
+    {
+      "epoch": 0.78592,
+      "grad_norm": 0.39093401850285614,
+      "learning_rate": 2.3092900925225903e-05,
+      "loss": 0.6108,
+      "step": 2456
+    },
+    {
+      "epoch": 0.78624,
+      "grad_norm": 0.3630710819278059,
+      "learning_rate": 2.3026693784016896e-05,
+      "loss": 0.5769,
+      "step": 2457
+    },
+    {
+      "epoch": 0.78656,
+      "grad_norm": 0.3253875198388068,
+      "learning_rate": 2.2960569335660685e-05,
+      "loss": 0.5478,
+      "step": 2458
+    },
+    {
+      "epoch": 0.78688,
+      "grad_norm": 0.3906710897143039,
+      "learning_rate": 2.2894527651195152e-05,
+      "loss": 0.603,
+      "step": 2459
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3607411091633684,
+      "learning_rate": 2.2828568801569283e-05,
+      "loss": 0.5763,
+      "step": 2460
+    },
+    {
+      "epoch": 0.78752,
+      "grad_norm": 0.35173782387112157,
+      "learning_rate": 2.2762692857642963e-05,
+      "loss": 0.5981,
+      "step": 2461
+    },
+    {
+      "epoch": 0.78784,
+      "grad_norm": 0.339858955189516,
+      "learning_rate": 2.2696899890187162e-05,
+      "loss": 0.6017,
+      "step": 2462
+    },
+    {
+      "epoch": 0.78816,
+      "grad_norm": 0.34544866050885736,
+      "learning_rate": 2.263118996988357e-05,
+      "loss": 0.5363,
+      "step": 2463
+    },
+    {
+      "epoch": 0.78848,
+      "grad_norm": 0.34761095872013703,
+      "learning_rate": 2.2565563167324743e-05,
+      "loss": 0.5989,
+      "step": 2464
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.31530810134808945,
+      "learning_rate": 2.2500019553013964e-05,
+      "loss": 0.5473,
+      "step": 2465
+    },
+    {
+      "epoch": 0.78912,
+      "grad_norm": 0.33415143701761924,
+      "learning_rate": 2.2434559197365034e-05,
+      "loss": 0.5469,
+      "step": 2466
+    },
+    {
+      "epoch": 0.78944,
+      "grad_norm": 0.35692277498717956,
+      "learning_rate": 2.236918217070244e-05,
+      "loss": 0.5848,
+      "step": 2467
+    },
+    {
+      "epoch": 0.78976,
+      "grad_norm": 0.347516591358437,
+      "learning_rate": 2.2303888543261032e-05,
+      "loss": 0.5526,
+      "step": 2468
+    },
+    {
+      "epoch": 0.79008,
+      "grad_norm": 0.34620092675831654,
+      "learning_rate": 2.223867838518615e-05,
+      "loss": 0.5462,
+      "step": 2469
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.33557138741524,
+      "learning_rate": 2.217355176653345e-05,
+      "loss": 0.5257,
+      "step": 2470
+    },
+    {
+      "epoch": 0.79072,
+      "grad_norm": 0.3697668097801008,
+      "learning_rate": 2.21085087572688e-05,
+      "loss": 0.5585,
+      "step": 2471
+    },
+    {
+      "epoch": 0.79104,
+      "grad_norm": 0.3715956488757461,
+      "learning_rate": 2.204354942726824e-05,
+      "loss": 0.5508,
+      "step": 2472
+    },
+    {
+      "epoch": 0.79136,
+      "grad_norm": 0.3243701754045067,
+      "learning_rate": 2.1978673846318e-05,
+      "loss": 0.5759,
+      "step": 2473
+    },
+    {
+      "epoch": 0.79168,
+      "grad_norm": 0.36072046537223734,
+      "learning_rate": 2.191388208411421e-05,
+      "loss": 0.5758,
+      "step": 2474
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.33967224222953046,
+      "learning_rate": 2.184917421026309e-05,
+      "loss": 0.5881,
+      "step": 2475
+    },
+    {
+      "epoch": 0.79232,
+      "grad_norm": 0.3465350920768046,
+      "learning_rate": 2.1784550294280616e-05,
+      "loss": 0.615,
+      "step": 2476
+    },
+    {
+      "epoch": 0.79264,
+      "grad_norm": 0.37908477222437176,
+      "learning_rate": 2.172001040559264e-05,
+      "loss": 0.6177,
+      "step": 2477
+    },
+    {
+      "epoch": 0.79296,
+      "grad_norm": 0.3278710982429467,
+      "learning_rate": 2.1655554613534767e-05,
+      "loss": 0.5232,
+      "step": 2478
+    },
+    {
+      "epoch": 0.79328,
+      "grad_norm": 0.3384039255396264,
+      "learning_rate": 2.1591182987352142e-05,
+      "loss": 0.5578,
+      "step": 2479
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3403251723616923,
+      "learning_rate": 2.1526895596199626e-05,
+      "loss": 0.6037,
+      "step": 2480
+    },
+    {
+      "epoch": 0.79392,
+      "grad_norm": 0.3710591109731778,
+      "learning_rate": 2.1462692509141467e-05,
+      "loss": 0.6148,
+      "step": 2481
+    },
+    {
+      "epoch": 0.79424,
+      "grad_norm": 0.3470562337667696,
+      "learning_rate": 2.1398573795151432e-05,
+      "loss": 0.5675,
+      "step": 2482
+    },
+    {
+      "epoch": 0.79456,
+      "grad_norm": 0.3473910022037806,
+      "learning_rate": 2.133453952311264e-05,
+      "loss": 0.5619,
+      "step": 2483
+    },
+    {
+      "epoch": 0.79488,
+      "grad_norm": 0.3438608091876821,
+      "learning_rate": 2.1270589761817407e-05,
+      "loss": 0.6002,
+      "step": 2484
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3446990832392834,
+      "learning_rate": 2.1206724579967373e-05,
+      "loss": 0.5499,
+      "step": 2485
+    },
+    {
+      "epoch": 0.79552,
+      "grad_norm": 0.3545567847508976,
+      "learning_rate": 2.1142944046173207e-05,
+      "loss": 0.6121,
+      "step": 2486
+    },
+    {
+      "epoch": 0.79584,
+      "grad_norm": 0.3289303546556512,
+      "learning_rate": 2.1079248228954718e-05,
+      "loss": 0.5824,
+      "step": 2487
+    },
+    {
+      "epoch": 0.79616,
+      "grad_norm": 0.33370158019647944,
+      "learning_rate": 2.1015637196740712e-05,
+      "loss": 0.5954,
+      "step": 2488
+    },
+    {
+      "epoch": 0.79648,
+      "grad_norm": 0.33882023411268536,
+      "learning_rate": 2.0952111017868813e-05,
+      "loss": 0.5876,
+      "step": 2489
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.32884890388514093,
+      "learning_rate": 2.088866976058559e-05,
+      "loss": 0.554,
+      "step": 2490
+    },
+    {
+      "epoch": 0.79712,
+      "grad_norm": 0.33133134284141186,
+      "learning_rate": 2.082531349304636e-05,
+      "loss": 0.545,
+      "step": 2491
+    },
+    {
+      "epoch": 0.79744,
+      "grad_norm": 0.33783537894596144,
+      "learning_rate": 2.0762042283315052e-05,
+      "loss": 0.5587,
+      "step": 2492
+    },
+    {
+      "epoch": 0.79776,
+      "grad_norm": 0.3557898528807314,
+      "learning_rate": 2.0698856199364348e-05,
+      "loss": 0.5588,
+      "step": 2493
+    },
+    {
+      "epoch": 0.79808,
+      "grad_norm": 0.3500657192864167,
+      "learning_rate": 2.0635755309075343e-05,
+      "loss": 0.5911,
+      "step": 2494
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3647386915647849,
+      "learning_rate": 2.0572739680237717e-05,
+      "loss": 0.599,
+      "step": 2495
+    },
+    {
+      "epoch": 0.79872,
+      "grad_norm": 0.3591643550741855,
+      "learning_rate": 2.0509809380549537e-05,
+      "loss": 0.5871,
+      "step": 2496
+    },
+    {
+      "epoch": 0.79904,
+      "grad_norm": 0.3517398474758034,
+      "learning_rate": 2.0446964477617116e-05,
+      "loss": 0.5943,
+      "step": 2497
+    },
+    {
+      "epoch": 0.79936,
+      "grad_norm": 0.3490242405490287,
+      "learning_rate": 2.0384205038955127e-05,
+      "loss": 0.5749,
+      "step": 2498
+    },
+    {
+      "epoch": 0.79968,
+      "grad_norm": 0.33831548102579173,
+      "learning_rate": 2.032153113198636e-05,
+      "loss": 0.5579,
+      "step": 2499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3634872835470982,
+      "learning_rate": 2.025894282404177e-05,
+      "loss": 0.612,
+      "step": 2500
+    },
+    {
+      "epoch": 0.80032,
+      "grad_norm": 0.3425232023996621,
+      "learning_rate": 2.019644018236029e-05,
+      "loss": 0.5536,
+      "step": 2501
+    },
+    {
+      "epoch": 0.80064,
+      "grad_norm": 0.3480662902670338,
+      "learning_rate": 2.0134023274088898e-05,
+      "loss": 0.5832,
+      "step": 2502
+    },
+    {
+      "epoch": 0.80096,
+      "grad_norm": 0.3329767821258931,
+      "learning_rate": 2.0071692166282384e-05,
+      "loss": 0.6013,
+      "step": 2503
+    },
+    {
+      "epoch": 0.80128,
+      "grad_norm": 0.33173035226101194,
+      "learning_rate": 2.0009446925903462e-05,
+      "loss": 0.5412,
+      "step": 2504
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.35892749701366916,
+      "learning_rate": 1.9947287619822474e-05,
+      "loss": 0.6004,
+      "step": 2505
+    },
+    {
+      "epoch": 0.80192,
+      "grad_norm": 0.32380929116089685,
+      "learning_rate": 1.9885214314817568e-05,
+      "loss": 0.5298,
+      "step": 2506
+    },
+    {
+      "epoch": 0.80224,
+      "grad_norm": 0.3648924820132169,
+      "learning_rate": 1.9823227077574392e-05,
+      "loss": 0.5264,
+      "step": 2507
+    },
+    {
+      "epoch": 0.80256,
+      "grad_norm": 0.3476892297982287,
+      "learning_rate": 1.9761325974686208e-05,
+      "loss": 0.5717,
+      "step": 2508
+    },
+    {
+      "epoch": 0.80288,
+      "grad_norm": 0.49034137620021845,
+      "learning_rate": 1.9699511072653733e-05,
+      "loss": 0.5594,
+      "step": 2509
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.33899973081415063,
+      "learning_rate": 1.9637782437885023e-05,
+      "loss": 0.5813,
+      "step": 2510
+    },
+    {
+      "epoch": 0.80352,
+      "grad_norm": 0.34594804939489354,
+      "learning_rate": 1.9576140136695542e-05,
+      "loss": 0.5848,
+      "step": 2511
+    },
+    {
+      "epoch": 0.80384,
+      "grad_norm": 0.35349589268410203,
+      "learning_rate": 1.951458423530791e-05,
+      "loss": 0.5972,
+      "step": 2512
+    },
+    {
+      "epoch": 0.80416,
+      "grad_norm": 0.34116101498955914,
+      "learning_rate": 1.945311479985199e-05,
+      "loss": 0.5527,
+      "step": 2513
+    },
+    {
+      "epoch": 0.80448,
+      "grad_norm": 0.3452356052708206,
+      "learning_rate": 1.9391731896364784e-05,
+      "loss": 0.5324,
+      "step": 2514
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3203982896423424,
+      "learning_rate": 1.933043559079022e-05,
+      "loss": 0.5573,
+      "step": 2515
+    },
+    {
+      "epoch": 0.80512,
+      "grad_norm": 0.3402648463973037,
+      "learning_rate": 1.926922594897932e-05,
+      "loss": 0.5038,
+      "step": 2516
+    },
+    {
+      "epoch": 0.80544,
+      "grad_norm": 0.34359612396387357,
+      "learning_rate": 1.9208103036689894e-05,
+      "loss": 0.5888,
+      "step": 2517
+    },
+    {
+      "epoch": 0.80576,
+      "grad_norm": 0.3442759319141374,
+      "learning_rate": 1.9147066919586644e-05,
+      "loss": 0.5831,
+      "step": 2518
+    },
+    {
+      "epoch": 0.80608,
+      "grad_norm": 0.3272406428240339,
+      "learning_rate": 1.9086117663241055e-05,
+      "loss": 0.5817,
+      "step": 2519
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3742487291696982,
+      "learning_rate": 1.9025255333131176e-05,
+      "loss": 0.6346,
+      "step": 2520
+    },
+    {
+      "epoch": 0.80672,
+      "grad_norm": 0.40930377904831494,
+      "learning_rate": 1.8964479994641805e-05,
+      "loss": 0.5793,
+      "step": 2521
+    },
+    {
+      "epoch": 0.80704,
+      "grad_norm": 0.3565577889922621,
+      "learning_rate": 1.8903791713064233e-05,
+      "loss": 0.5882,
+      "step": 2522
+    },
+    {
+      "epoch": 0.80736,
+      "grad_norm": 0.3530479024802253,
+      "learning_rate": 1.884319055359617e-05,
+      "loss": 0.5817,
+      "step": 2523
+    },
+    {
+      "epoch": 0.80768,
+      "grad_norm": 0.3724923440038362,
+      "learning_rate": 1.878267658134184e-05,
+      "loss": 0.6316,
+      "step": 2524
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.34200673881188254,
+      "learning_rate": 1.872224986131168e-05,
+      "loss": 0.5546,
+      "step": 2525
+    },
+    {
+      "epoch": 0.80832,
+      "grad_norm": 0.34782820641086815,
+      "learning_rate": 1.8661910458422514e-05,
+      "loss": 0.5609,
+      "step": 2526
+    },
+    {
+      "epoch": 0.80864,
+      "grad_norm": 0.3815838807460841,
+      "learning_rate": 1.860165843749725e-05,
+      "loss": 0.6105,
+      "step": 2527
+    },
+    {
+      "epoch": 0.80896,
+      "grad_norm": 0.32259838169392785,
+      "learning_rate": 1.8541493863265e-05,
+      "loss": 0.5511,
+      "step": 2528
+    },
+    {
+      "epoch": 0.80928,
+      "grad_norm": 0.3717571350885919,
+      "learning_rate": 1.8481416800360872e-05,
+      "loss": 0.5916,
+      "step": 2529
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.33874743755926234,
+      "learning_rate": 1.8421427313326046e-05,
+      "loss": 0.5645,
+      "step": 2530
+    },
+    {
+      "epoch": 0.80992,
+      "grad_norm": 0.32945992612330516,
+      "learning_rate": 1.8361525466607488e-05,
+      "loss": 0.5256,
+      "step": 2531
+    },
+    {
+      "epoch": 0.81024,
+      "grad_norm": 0.3408395461618973,
+      "learning_rate": 1.8301711324558158e-05,
+      "loss": 0.5444,
+      "step": 2532
+    },
+    {
+      "epoch": 0.81056,
+      "grad_norm": 0.3360345310477082,
+      "learning_rate": 1.8241984951436665e-05,
+      "loss": 0.5082,
+      "step": 2533
+    },
+    {
+      "epoch": 0.81088,
+      "grad_norm": 0.3619912782367946,
+      "learning_rate": 1.8182346411407412e-05,
+      "loss": 0.5975,
+      "step": 2534
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.36009948064504005,
+      "learning_rate": 1.8122795768540435e-05,
+      "loss": 0.566,
+      "step": 2535
+    },
+    {
+      "epoch": 0.81152,
+      "grad_norm": 0.3418557525512484,
+      "learning_rate": 1.8063333086811272e-05,
+      "loss": 0.5837,
+      "step": 2536
+    },
+    {
+      "epoch": 0.81184,
+      "grad_norm": 0.34353632916754195,
+      "learning_rate": 1.8003958430101085e-05,
+      "loss": 0.6053,
+      "step": 2537
+    },
+    {
+      "epoch": 0.81216,
+      "grad_norm": 0.3466534323420069,
+      "learning_rate": 1.7944671862196316e-05,
+      "loss": 0.569,
+      "step": 2538
+    },
+    {
+      "epoch": 0.81248,
+      "grad_norm": 0.33448332937251646,
+      "learning_rate": 1.7885473446788913e-05,
+      "loss": 0.5097,
+      "step": 2539
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.338864461741268,
+      "learning_rate": 1.7826363247476062e-05,
+      "loss": 0.5711,
+      "step": 2540
+    },
+    {
+      "epoch": 0.81312,
+      "grad_norm": 0.3961894658231154,
+      "learning_rate": 1.7767341327760155e-05,
+      "loss": 0.6439,
+      "step": 2541
+    },
+    {
+      "epoch": 0.81344,
+      "grad_norm": 0.3389672150998602,
+      "learning_rate": 1.7708407751048804e-05,
+      "loss": 0.5311,
+      "step": 2542
+    },
+    {
+      "epoch": 0.81376,
+      "grad_norm": 0.3304649549560958,
+      "learning_rate": 1.7649562580654632e-05,
+      "loss": 0.5521,
+      "step": 2543
+    },
+    {
+      "epoch": 0.81408,
+      "grad_norm": 0.31581192258434576,
+      "learning_rate": 1.7590805879795356e-05,
+      "loss": 0.554,
+      "step": 2544
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3598882048796307,
+      "learning_rate": 1.7532137711593665e-05,
+      "loss": 0.5951,
+      "step": 2545
+    },
+    {
+      "epoch": 0.81472,
+      "grad_norm": 0.36921175003301293,
+      "learning_rate": 1.747355813907704e-05,
+      "loss": 0.6196,
+      "step": 2546
+    },
+    {
+      "epoch": 0.81504,
+      "grad_norm": 0.3247460409488971,
+      "learning_rate": 1.7415067225177893e-05,
+      "loss": 0.5536,
+      "step": 2547
+    },
+    {
+      "epoch": 0.81536,
+      "grad_norm": 0.34154135482010906,
+      "learning_rate": 1.73566650327333e-05,
+      "loss": 0.5606,
+      "step": 2548
+    },
+    {
+      "epoch": 0.81568,
+      "grad_norm": 0.3584651750407313,
+      "learning_rate": 1.7298351624485065e-05,
+      "loss": 0.5707,
+      "step": 2549
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3517416392835679,
+      "learning_rate": 1.724012706307966e-05,
+      "loss": 0.5829,
+      "step": 2550
+    },
+    {
+      "epoch": 0.81632,
+      "grad_norm": 0.32107824931031526,
+      "learning_rate": 1.7181991411067987e-05,
+      "loss": 0.5364,
+      "step": 2551
+    },
+    {
+      "epoch": 0.81664,
+      "grad_norm": 0.319947206131299,
+      "learning_rate": 1.712394473090555e-05,
+      "loss": 0.5448,
+      "step": 2552
+    },
+    {
+      "epoch": 0.81696,
+      "grad_norm": 0.3446025975101544,
+      "learning_rate": 1.7065987084952217e-05,
+      "loss": 0.5848,
+      "step": 2553
+    },
+    {
+      "epoch": 0.81728,
+      "grad_norm": 0.357778438430661,
+      "learning_rate": 1.7008118535472196e-05,
+      "loss": 0.5725,
+      "step": 2554
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3549516723717266,
+      "learning_rate": 1.6950339144633975e-05,
+      "loss": 0.589,
+      "step": 2555
+    },
+    {
+      "epoch": 0.81792,
+      "grad_norm": 0.3535820407730016,
+      "learning_rate": 1.6892648974510328e-05,
+      "loss": 0.5997,
+      "step": 2556
+    },
+    {
+      "epoch": 0.81824,
+      "grad_norm": 0.3443859652005636,
+      "learning_rate": 1.6835048087078075e-05,
+      "loss": 0.5959,
+      "step": 2557
+    },
+    {
+      "epoch": 0.81856,
+      "grad_norm": 0.35135030865264794,
+      "learning_rate": 1.677753654421821e-05,
+      "loss": 0.5563,
+      "step": 2558
+    },
+    {
+      "epoch": 0.81888,
+      "grad_norm": 0.3517772511502157,
+      "learning_rate": 1.6720114407715658e-05,
+      "loss": 0.583,
+      "step": 2559
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.37629848204855904,
+      "learning_rate": 1.6662781739259403e-05,
+      "loss": 0.5675,
+      "step": 2560
+    },
+    {
+      "epoch": 0.81952,
+      "grad_norm": 0.3329615188076578,
+      "learning_rate": 1.6605538600442194e-05,
+      "loss": 0.5936,
+      "step": 2561
+    },
+    {
+      "epoch": 0.81984,
+      "grad_norm": 0.34608915327448414,
+      "learning_rate": 1.6548385052760674e-05,
+      "loss": 0.5779,
+      "step": 2562
+    },
+    {
+      "epoch": 0.82016,
+      "grad_norm": 0.38482917890870877,
+      "learning_rate": 1.6491321157615257e-05,
+      "loss": 0.539,
+      "step": 2563
+    },
+    {
+      "epoch": 0.82048,
+      "grad_norm": 0.354495920622266,
+      "learning_rate": 1.6434346976309943e-05,
+      "loss": 0.5032,
+      "step": 2564
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3342699345517741,
+      "learning_rate": 1.6377462570052438e-05,
+      "loss": 0.5739,
+      "step": 2565
+    },
+    {
+      "epoch": 0.82112,
+      "grad_norm": 0.3476272880919875,
+      "learning_rate": 1.632066799995401e-05,
+      "loss": 0.5887,
+      "step": 2566
+    },
+    {
+      "epoch": 0.82144,
+      "grad_norm": 0.47916521297807313,
+      "learning_rate": 1.626396332702933e-05,
+      "loss": 0.5443,
+      "step": 2567
+    },
+    {
+      "epoch": 0.82176,
+      "grad_norm": 0.3178166462315333,
+      "learning_rate": 1.620734861219658e-05,
+      "loss": 0.5439,
+      "step": 2568
+    },
+    {
+      "epoch": 0.82208,
+      "grad_norm": 0.34288874484912857,
+      "learning_rate": 1.6150823916277248e-05,
+      "loss": 0.5582,
+      "step": 2569
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.41067229081441986,
+      "learning_rate": 1.6094389299996125e-05,
+      "loss": 0.6094,
+      "step": 2570
+    },
+    {
+      "epoch": 0.82272,
+      "grad_norm": 0.3618375438025809,
+      "learning_rate": 1.603804482398127e-05,
+      "loss": 0.6114,
+      "step": 2571
+    },
+    {
+      "epoch": 0.82304,
+      "grad_norm": 0.3790824543863109,
+      "learning_rate": 1.598179054876382e-05,
+      "loss": 0.6148,
+      "step": 2572
+    },
+    {
+      "epoch": 0.82336,
+      "grad_norm": 0.3485899110997312,
+      "learning_rate": 1.5925626534778103e-05,
+      "loss": 0.557,
+      "step": 2573
+    },
+    {
+      "epoch": 0.82368,
+      "grad_norm": 0.3514882907126632,
+      "learning_rate": 1.5869552842361378e-05,
+      "loss": 0.5372,
+      "step": 2574
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3364951753060846,
+      "learning_rate": 1.5813569531753968e-05,
+      "loss": 0.605,
+      "step": 2575
+    },
+    {
+      "epoch": 0.82432,
+      "grad_norm": 0.3684388543836776,
+      "learning_rate": 1.5757676663099076e-05,
+      "loss": 0.6105,
+      "step": 2576
+    },
+    {
+      "epoch": 0.82464,
+      "grad_norm": 0.3321301381734369,
+      "learning_rate": 1.5701874296442665e-05,
+      "loss": 0.5641,
+      "step": 2577
+    },
+    {
+      "epoch": 0.82496,
+      "grad_norm": 0.33324934501204545,
+      "learning_rate": 1.564616249173355e-05,
+      "loss": 0.5505,
+      "step": 2578
+    },
+    {
+      "epoch": 0.82528,
+      "grad_norm": 0.3543134291092468,
+      "learning_rate": 1.559054130882327e-05,
+      "loss": 0.529,
+      "step": 2579
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3581079442657288,
+      "learning_rate": 1.553501080746592e-05,
+      "loss": 0.5957,
+      "step": 2580
+    },
+    {
+      "epoch": 0.82592,
+      "grad_norm": 0.34821991077229714,
+      "learning_rate": 1.5479571047318287e-05,
+      "loss": 0.5619,
+      "step": 2581
+    },
+    {
+      "epoch": 0.82624,
+      "grad_norm": 0.3446841489137521,
+      "learning_rate": 1.5424222087939544e-05,
+      "loss": 0.5579,
+      "step": 2582
+    },
+    {
+      "epoch": 0.82656,
+      "grad_norm": 0.3512213595332296,
+      "learning_rate": 1.5368963988791453e-05,
+      "loss": 0.548,
+      "step": 2583
+    },
+    {
+      "epoch": 0.82688,
+      "grad_norm": 0.35407948504044434,
+      "learning_rate": 1.5313796809238057e-05,
+      "loss": 0.5291,
+      "step": 2584
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.35336483465835566,
+      "learning_rate": 1.5258720608545762e-05,
+      "loss": 0.6354,
+      "step": 2585
+    },
+    {
+      "epoch": 0.82752,
+      "grad_norm": 0.35100577830346985,
+      "learning_rate": 1.5203735445883282e-05,
+      "loss": 0.5833,
+      "step": 2586
+    },
+    {
+      "epoch": 0.82784,
+      "grad_norm": 0.37158239057050696,
+      "learning_rate": 1.514884138032142e-05,
+      "loss": 0.5611,
+      "step": 2587
+    },
+    {
+      "epoch": 0.82816,
+      "grad_norm": 0.3591429055911957,
+      "learning_rate": 1.5094038470833217e-05,
+      "loss": 0.6189,
+      "step": 2588
+    },
+    {
+      "epoch": 0.82848,
+      "grad_norm": 0.3624361997159548,
+      "learning_rate": 1.5039326776293771e-05,
+      "loss": 0.6094,
+      "step": 2589
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.348654141636165,
+      "learning_rate": 1.4984706355480094e-05,
+      "loss": 0.6149,
+      "step": 2590
+    },
+    {
+      "epoch": 0.82912,
+      "grad_norm": 0.33611335623002886,
+      "learning_rate": 1.4930177267071277e-05,
+      "loss": 0.5616,
+      "step": 2591
+    },
+    {
+      "epoch": 0.82944,
+      "grad_norm": 0.546330809890968,
+      "learning_rate": 1.4875739569648172e-05,
+      "loss": 0.5911,
+      "step": 2592
+    },
+    {
+      "epoch": 0.82976,
+      "grad_norm": 0.3929949240286303,
+      "learning_rate": 1.4821393321693523e-05,
+      "loss": 0.5749,
+      "step": 2593
+    },
+    {
+      "epoch": 0.83008,
+      "grad_norm": 0.36194757328710503,
+      "learning_rate": 1.4767138581591822e-05,
+      "loss": 0.5784,
+      "step": 2594
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3404384657653635,
+      "learning_rate": 1.4712975407629203e-05,
+      "loss": 0.5713,
+      "step": 2595
+    },
+    {
+      "epoch": 0.83072,
+      "grad_norm": 0.33359288488877453,
+      "learning_rate": 1.4658903857993489e-05,
+      "loss": 0.5369,
+      "step": 2596
+    },
+    {
+      "epoch": 0.83104,
+      "grad_norm": 0.33777074819030184,
+      "learning_rate": 1.4604923990774067e-05,
+      "loss": 0.5617,
+      "step": 2597
+    },
+    {
+      "epoch": 0.83136,
+      "grad_norm": 0.4123918560404225,
+      "learning_rate": 1.455103586396177e-05,
+      "loss": 0.6266,
+      "step": 2598
+    },
+    {
+      "epoch": 0.83168,
+      "grad_norm": 0.3408086516335801,
+      "learning_rate": 1.449723953544896e-05,
+      "loss": 0.5591,
+      "step": 2599
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.33205955167379664,
+      "learning_rate": 1.4443535063029279e-05,
+      "loss": 0.5408,
+      "step": 2600
+    },
+    {
+      "epoch": 0.83232,
+      "grad_norm": 0.35731959101580363,
+      "learning_rate": 1.4389922504397769e-05,
+      "loss": 0.5763,
+      "step": 2601
+    },
+    {
+      "epoch": 0.83264,
+      "grad_norm": 0.3351865605996006,
+      "learning_rate": 1.433640191715072e-05,
+      "loss": 0.5408,
+      "step": 2602
+    },
+    {
+      "epoch": 0.83296,
+      "grad_norm": 0.34730184802784503,
+      "learning_rate": 1.4282973358785557e-05,
+      "loss": 0.586,
+      "step": 2603
+    },
+    {
+      "epoch": 0.83328,
+      "grad_norm": 0.3669686168935996,
+      "learning_rate": 1.4229636886700925e-05,
+      "loss": 0.5721,
+      "step": 2604
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.33472540609377066,
+      "learning_rate": 1.417639255819645e-05,
+      "loss": 0.587,
+      "step": 2605
+    },
+    {
+      "epoch": 0.83392,
+      "grad_norm": 0.37710979799483263,
+      "learning_rate": 1.4123240430472828e-05,
+      "loss": 0.5949,
+      "step": 2606
+    },
+    {
+      "epoch": 0.83424,
+      "grad_norm": 0.3253109261838384,
+      "learning_rate": 1.4070180560631707e-05,
+      "loss": 0.5777,
+      "step": 2607
+    },
+    {
+      "epoch": 0.83456,
+      "grad_norm": 0.34239123450888104,
+      "learning_rate": 1.4017213005675567e-05,
+      "loss": 0.5544,
+      "step": 2608
+    },
+    {
+      "epoch": 0.83488,
+      "grad_norm": 0.33994658984642845,
+      "learning_rate": 1.396433782250779e-05,
+      "loss": 0.6011,
+      "step": 2609
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3441195074068338,
+      "learning_rate": 1.3911555067932425e-05,
+      "loss": 0.5852,
+      "step": 2610
+    },
+    {
+      "epoch": 0.83552,
+      "grad_norm": 0.3555807272291283,
+      "learning_rate": 1.3858864798654347e-05,
+      "loss": 0.5839,
+      "step": 2611
+    },
+    {
+      "epoch": 0.83584,
+      "grad_norm": 0.36325441830243516,
+      "learning_rate": 1.3806267071278934e-05,
+      "loss": 0.5695,
+      "step": 2612
+    },
+    {
+      "epoch": 0.83616,
+      "grad_norm": 0.3953561428405037,
+      "learning_rate": 1.3753761942312294e-05,
+      "loss": 0.6256,
+      "step": 2613
+    },
+    {
+      "epoch": 0.83648,
+      "grad_norm": 0.3652391929111533,
+      "learning_rate": 1.3701349468160906e-05,
+      "loss": 0.6038,
+      "step": 2614
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3470206100630685,
+      "learning_rate": 1.3649029705131844e-05,
+      "loss": 0.5945,
+      "step": 2615
+    },
+    {
+      "epoch": 0.83712,
+      "grad_norm": 0.341386609067006,
+      "learning_rate": 1.3596802709432466e-05,
+      "loss": 0.5598,
+      "step": 2616
+    },
+    {
+      "epoch": 0.83744,
+      "grad_norm": 0.31995790343333264,
+      "learning_rate": 1.354466853717059e-05,
+      "loss": 0.5911,
+      "step": 2617
+    },
+    {
+      "epoch": 0.83776,
+      "grad_norm": 0.33143148180840015,
+      "learning_rate": 1.3492627244354195e-05,
+      "loss": 0.5413,
+      "step": 2618
+    },
+    {
+      "epoch": 0.83808,
+      "grad_norm": 0.34003158625988406,
+      "learning_rate": 1.3440678886891544e-05,
+      "loss": 0.5552,
+      "step": 2619
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3340954964269716,
+      "learning_rate": 1.3388823520591077e-05,
+      "loss": 0.5752,
+      "step": 2620
+    },
+    {
+      "epoch": 0.83872,
+      "grad_norm": 0.3412906441356953,
+      "learning_rate": 1.333706120116126e-05,
+      "loss": 0.5518,
+      "step": 2621
+    },
+    {
+      "epoch": 0.83904,
+      "grad_norm": 0.518691759525488,
+      "learning_rate": 1.3285391984210694e-05,
+      "loss": 0.5325,
+      "step": 2622
+    },
+    {
+      "epoch": 0.83936,
+      "grad_norm": 0.34948417036987467,
+      "learning_rate": 1.3233815925247839e-05,
+      "loss": 0.5793,
+      "step": 2623
+    },
+    {
+      "epoch": 0.83968,
+      "grad_norm": 0.33387704714373895,
+      "learning_rate": 1.3182333079681197e-05,
+      "loss": 0.5559,
+      "step": 2624
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3478150704785004,
+      "learning_rate": 1.3130943502819082e-05,
+      "loss": 0.587,
+      "step": 2625
+    },
+    {
+      "epoch": 0.84032,
+      "grad_norm": 0.34808433684798074,
+      "learning_rate": 1.3079647249869554e-05,
+      "loss": 0.5573,
+      "step": 2626
+    },
+    {
+      "epoch": 0.84064,
+      "grad_norm": 0.3555303762744616,
+      "learning_rate": 1.3028444375940496e-05,
+      "loss": 0.5422,
+      "step": 2627
+    },
+    {
+      "epoch": 0.84096,
+      "grad_norm": 0.36693657979142713,
+      "learning_rate": 1.2977334936039454e-05,
+      "loss": 0.5699,
+      "step": 2628
+    },
+    {
+      "epoch": 0.84128,
+      "grad_norm": 0.3888044978014789,
+      "learning_rate": 1.292631898507356e-05,
+      "loss": 0.598,
+      "step": 2629
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.35891004519502656,
+      "learning_rate": 1.2875396577849552e-05,
+      "loss": 0.5889,
+      "step": 2630
+    },
+    {
+      "epoch": 0.84192,
+      "grad_norm": 0.36315079256525856,
+      "learning_rate": 1.2824567769073636e-05,
+      "loss": 0.5662,
+      "step": 2631
+    },
+    {
+      "epoch": 0.84224,
+      "grad_norm": 0.3586950693191108,
+      "learning_rate": 1.27738326133515e-05,
+      "loss": 0.5605,
+      "step": 2632
+    },
+    {
+      "epoch": 0.84256,
+      "grad_norm": 0.3704429464120191,
+      "learning_rate": 1.2723191165188219e-05,
+      "loss": 0.555,
+      "step": 2633
+    },
+    {
+      "epoch": 0.84288,
+      "grad_norm": 0.35981620667429703,
+      "learning_rate": 1.2672643478988144e-05,
+      "loss": 0.5897,
+      "step": 2634
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.37174072493699917,
+      "learning_rate": 1.2622189609054979e-05,
+      "loss": 0.5559,
+      "step": 2635
+    },
+    {
+      "epoch": 0.84352,
+      "grad_norm": 0.3366993256397614,
+      "learning_rate": 1.2571829609591568e-05,
+      "loss": 0.5548,
+      "step": 2636
+    },
+    {
+      "epoch": 0.84384,
+      "grad_norm": 0.34727323293654794,
+      "learning_rate": 1.2521563534699964e-05,
+      "loss": 0.5758,
+      "step": 2637
+    },
+    {
+      "epoch": 0.84416,
+      "grad_norm": 0.3275944921982183,
+      "learning_rate": 1.2471391438381264e-05,
+      "loss": 0.5549,
+      "step": 2638
+    },
+    {
+      "epoch": 0.84448,
+      "grad_norm": 0.3639262908404435,
+      "learning_rate": 1.242131337453567e-05,
+      "loss": 0.5556,
+      "step": 2639
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.35231147612482977,
+      "learning_rate": 1.2371329396962273e-05,
+      "loss": 0.5532,
+      "step": 2640
+    },
+    {
+      "epoch": 0.84512,
+      "grad_norm": 0.3830053274716528,
+      "learning_rate": 1.2321439559359193e-05,
+      "loss": 0.6275,
+      "step": 2641
+    },
+    {
+      "epoch": 0.84544,
+      "grad_norm": 0.31845931261951443,
+      "learning_rate": 1.2271643915323317e-05,
+      "loss": 0.5472,
+      "step": 2642
+    },
+    {
+      "epoch": 0.84576,
+      "grad_norm": 0.3427011213998191,
+      "learning_rate": 1.2221942518350415e-05,
+      "loss": 0.5658,
+      "step": 2643
+    },
+    {
+      "epoch": 0.84608,
+      "grad_norm": 0.3513626484067751,
+      "learning_rate": 1.2172335421834957e-05,
+      "loss": 0.5861,
+      "step": 2644
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.35135266995205605,
+      "learning_rate": 1.2122822679070122e-05,
+      "loss": 0.6017,
+      "step": 2645
+    },
+    {
+      "epoch": 0.84672,
+      "grad_norm": 0.33436155632830905,
+      "learning_rate": 1.2073404343247752e-05,
+      "loss": 0.5248,
+      "step": 2646
+    },
+    {
+      "epoch": 0.84704,
+      "grad_norm": 0.3374426104451427,
+      "learning_rate": 1.202408046745821e-05,
+      "loss": 0.5784,
+      "step": 2647
+    },
+    {
+      "epoch": 0.84736,
+      "grad_norm": 0.3453557098555006,
+      "learning_rate": 1.1974851104690444e-05,
+      "loss": 0.5397,
+      "step": 2648
+    },
+    {
+      "epoch": 0.84768,
+      "grad_norm": 0.3625372888969604,
+      "learning_rate": 1.192571630783179e-05,
+      "loss": 0.5768,
+      "step": 2649
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3459052337748261,
+      "learning_rate": 1.1876676129668075e-05,
+      "loss": 0.5807,
+      "step": 2650
+    },
+    {
+      "epoch": 0.84832,
+      "grad_norm": 0.40475236982731055,
+      "learning_rate": 1.1827730622883425e-05,
+      "loss": 0.5746,
+      "step": 2651
+    },
+    {
+      "epoch": 0.84864,
+      "grad_norm": 0.34425907542118983,
+      "learning_rate": 1.1778879840060253e-05,
+      "loss": 0.5915,
+      "step": 2652
+    },
+    {
+      "epoch": 0.84896,
+      "grad_norm": 0.5933119668400415,
+      "learning_rate": 1.173012383367923e-05,
+      "loss": 0.576,
+      "step": 2653
+    },
+    {
+      "epoch": 0.84928,
+      "grad_norm": 0.3331184263117235,
+      "learning_rate": 1.1681462656119257e-05,
+      "loss": 0.5637,
+      "step": 2654
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.37719069707307695,
+      "learning_rate": 1.163289635965723e-05,
+      "loss": 0.558,
+      "step": 2655
+    },
+    {
+      "epoch": 0.84992,
+      "grad_norm": 0.37043432070547594,
+      "learning_rate": 1.1584424996468268e-05,
+      "loss": 0.6001,
+      "step": 2656
+    },
+    {
+      "epoch": 0.85024,
+      "grad_norm": 0.36164734024045697,
+      "learning_rate": 1.1536048618625362e-05,
+      "loss": 0.6163,
+      "step": 2657
+    },
+    {
+      "epoch": 0.85056,
+      "grad_norm": 0.34034283702173906,
+      "learning_rate": 1.148776727809956e-05,
+      "loss": 0.552,
+      "step": 2658
+    },
+    {
+      "epoch": 0.85088,
+      "grad_norm": 0.38567877856658006,
+      "learning_rate": 1.1439581026759783e-05,
+      "loss": 0.6218,
+      "step": 2659
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3591955610438624,
+      "learning_rate": 1.1391489916372766e-05,
+      "loss": 0.579,
+      "step": 2660
+    },
+    {
+      "epoch": 0.85152,
+      "grad_norm": 0.33245191012461534,
+      "learning_rate": 1.1343493998603083e-05,
+      "loss": 0.5753,
+      "step": 2661
+    },
+    {
+      "epoch": 0.85184,
+      "grad_norm": 0.47976499849916715,
+      "learning_rate": 1.1295593325012988e-05,
+      "loss": 0.5957,
+      "step": 2662
+    },
+    {
+      "epoch": 0.85216,
+      "grad_norm": 0.35215599710990786,
+      "learning_rate": 1.124778794706245e-05,
+      "loss": 0.5688,
+      "step": 2663
+    },
+    {
+      "epoch": 0.85248,
+      "grad_norm": 0.40121587600172964,
+      "learning_rate": 1.1200077916109075e-05,
+      "loss": 0.5883,
+      "step": 2664
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.37006485265448724,
+      "learning_rate": 1.1152463283407987e-05,
+      "loss": 0.5781,
+      "step": 2665
+    },
+    {
+      "epoch": 0.85312,
+      "grad_norm": 0.3712484528133468,
+      "learning_rate": 1.1104944100111891e-05,
+      "loss": 0.5832,
+      "step": 2666
+    },
+    {
+      "epoch": 0.85344,
+      "grad_norm": 0.3470463640747448,
+      "learning_rate": 1.1057520417270873e-05,
+      "loss": 0.5591,
+      "step": 2667
+    },
+    {
+      "epoch": 0.85376,
+      "grad_norm": 0.35187055682418883,
+      "learning_rate": 1.1010192285832466e-05,
+      "loss": 0.5796,
+      "step": 2668
+    },
+    {
+      "epoch": 0.85408,
+      "grad_norm": 0.363763686915762,
+      "learning_rate": 1.0962959756641566e-05,
+      "loss": 0.6015,
+      "step": 2669
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.35589410689397755,
+      "learning_rate": 1.0915822880440308e-05,
+      "loss": 0.6153,
+      "step": 2670
+    },
+    {
+      "epoch": 0.85472,
+      "grad_norm": 0.3686280230439948,
+      "learning_rate": 1.0868781707868126e-05,
+      "loss": 0.5925,
+      "step": 2671
+    },
+    {
+      "epoch": 0.85504,
+      "grad_norm": 0.333496968547741,
+      "learning_rate": 1.0821836289461628e-05,
+      "loss": 0.5258,
+      "step": 2672
+    },
+    {
+      "epoch": 0.85536,
+      "grad_norm": 0.3534845869134167,
+      "learning_rate": 1.0774986675654509e-05,
+      "loss": 0.6064,
+      "step": 2673
+    },
+    {
+      "epoch": 0.85568,
+      "grad_norm": 0.3285824331187314,
+      "learning_rate": 1.0728232916777604e-05,
+      "loss": 0.5045,
+      "step": 2674
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4422402636765624,
+      "learning_rate": 1.068157506305869e-05,
+      "loss": 0.5988,
+      "step": 2675
+    },
+    {
+      "epoch": 0.85632,
+      "grad_norm": 0.3277821779984777,
+      "learning_rate": 1.0635013164622598e-05,
+      "loss": 0.5255,
+      "step": 2676
+    },
+    {
+      "epoch": 0.85664,
+      "grad_norm": 0.3527888405250758,
+      "learning_rate": 1.0588547271491033e-05,
+      "loss": 0.564,
+      "step": 2677
+    },
+    {
+      "epoch": 0.85696,
+      "grad_norm": 0.31995112530676284,
+      "learning_rate": 1.0542177433582545e-05,
+      "loss": 0.5438,
+      "step": 2678
+    },
+    {
+      "epoch": 0.85728,
+      "grad_norm": 0.3721048764363539,
+      "learning_rate": 1.049590370071254e-05,
+      "loss": 0.6162,
+      "step": 2679
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.36379566636130883,
+      "learning_rate": 1.0449726122593107e-05,
+      "loss": 0.5804,
+      "step": 2680
+    },
+    {
+      "epoch": 0.85792,
+      "grad_norm": 0.3694160420463327,
+      "learning_rate": 1.0403644748833097e-05,
+      "loss": 0.5664,
+      "step": 2681
+    },
+    {
+      "epoch": 0.85824,
+      "grad_norm": 0.3438838456015583,
+      "learning_rate": 1.035765962893801e-05,
+      "loss": 0.5638,
+      "step": 2682
+    },
+    {
+      "epoch": 0.85856,
+      "grad_norm": 0.3407604631279433,
+      "learning_rate": 1.0311770812309873e-05,
+      "loss": 0.597,
+      "step": 2683
+    },
+    {
+      "epoch": 0.85888,
+      "grad_norm": 0.34980604356141215,
+      "learning_rate": 1.0265978348247318e-05,
+      "loss": 0.5294,
+      "step": 2684
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.33074675296681794,
+      "learning_rate": 1.022028228594547e-05,
+      "loss": 0.5592,
+      "step": 2685
+    },
+    {
+      "epoch": 0.85952,
+      "grad_norm": 0.35646341842882845,
+      "learning_rate": 1.0174682674495827e-05,
+      "loss": 0.5884,
+      "step": 2686
+    },
+    {
+      "epoch": 0.85984,
+      "grad_norm": 0.3327559283842682,
+      "learning_rate": 1.0129179562886327e-05,
+      "loss": 0.591,
+      "step": 2687
+    },
+    {
+      "epoch": 0.86016,
+      "grad_norm": 0.3562247424285518,
+      "learning_rate": 1.0083773000001207e-05,
+      "loss": 0.5409,
+      "step": 2688
+    },
+    {
+      "epoch": 0.86048,
+      "grad_norm": 0.34271633882254676,
+      "learning_rate": 1.0038463034620982e-05,
+      "loss": 0.5602,
+      "step": 2689
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.45829064243611684,
+      "learning_rate": 9.993249715422437e-06,
+      "loss": 0.5338,
+      "step": 2690
+    },
+    {
+      "epoch": 0.86112,
+      "grad_norm": 0.3571020177254091,
+      "learning_rate": 9.94813309097844e-06,
+      "loss": 0.5799,
+      "step": 2691
+    },
+    {
+      "epoch": 0.86144,
+      "grad_norm": 0.7690030167190615,
+      "learning_rate": 9.903113209758096e-06,
+      "loss": 0.5733,
+      "step": 2692
+    },
+    {
+      "epoch": 0.86176,
+      "grad_norm": 0.36397672579443835,
+      "learning_rate": 9.858190120126454e-06,
+      "loss": 0.5551,
+      "step": 2693
+    },
+    {
+      "epoch": 0.86208,
+      "grad_norm": 0.33307799914059033,
+      "learning_rate": 9.813363870344683e-06,
+      "loss": 0.59,
+      "step": 2694
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.35105460157092433,
+      "learning_rate": 9.76863450856984e-06,
+      "loss": 0.5773,
+      "step": 2695
+    },
+    {
+      "epoch": 0.86272,
+      "grad_norm": 0.344957999458551,
+      "learning_rate": 9.724002082854977e-06,
+      "loss": 0.5622,
+      "step": 2696
+    },
+    {
+      "epoch": 0.86304,
+      "grad_norm": 0.36117553927246615,
+      "learning_rate": 9.679466641148916e-06,
+      "loss": 0.6403,
+      "step": 2697
+    },
+    {
+      "epoch": 0.86336,
+      "grad_norm": 0.34524868159182515,
+      "learning_rate": 9.635028231296327e-06,
+      "loss": 0.5476,
+      "step": 2698
+    },
+    {
+      "epoch": 0.86368,
+      "grad_norm": 0.362278650943625,
+      "learning_rate": 9.590686901037648e-06,
+      "loss": 0.5699,
+      "step": 2699
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.34422212501217686,
+      "learning_rate": 9.546442698009061e-06,
+      "loss": 0.5619,
+      "step": 2700
+    },
+    {
+      "epoch": 0.86432,
+      "grad_norm": 0.42291120569078267,
+      "learning_rate": 9.502295669742289e-06,
+      "loss": 0.551,
+      "step": 2701
+    },
+    {
+      "epoch": 0.86464,
+      "grad_norm": 0.35497718610586765,
+      "learning_rate": 9.458245863664783e-06,
+      "loss": 0.5773,
+      "step": 2702
+    },
+    {
+      "epoch": 0.86496,
+      "grad_norm": 0.3589322565167151,
+      "learning_rate": 9.414293327099489e-06,
+      "loss": 0.6141,
+      "step": 2703
+    },
+    {
+      "epoch": 0.86528,
+      "grad_norm": 0.3579489197133505,
+      "learning_rate": 9.370438107264846e-06,
+      "loss": 0.5811,
+      "step": 2704
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.3361526389029414,
+      "learning_rate": 9.326680251274777e-06,
+      "loss": 0.5775,
+      "step": 2705
+    },
+    {
+      "epoch": 0.86592,
+      "grad_norm": 0.3620336094950284,
+      "learning_rate": 9.283019806138582e-06,
+      "loss": 0.6233,
+      "step": 2706
+    },
+    {
+      "epoch": 0.86624,
+      "grad_norm": 0.35552574324499514,
+      "learning_rate": 9.239456818760905e-06,
+      "loss": 0.604,
+      "step": 2707
+    },
+    {
+      "epoch": 0.86656,
+      "grad_norm": 0.3626207169312369,
+      "learning_rate": 9.195991335941756e-06,
+      "loss": 0.5535,
+      "step": 2708
+    },
+    {
+      "epoch": 0.86688,
+      "grad_norm": 0.36165536414025484,
+      "learning_rate": 9.152623404376293e-06,
+      "loss": 0.5794,
+      "step": 2709
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.34087441439371674,
+      "learning_rate": 9.10935307065497e-06,
+      "loss": 0.4998,
+      "step": 2710
+    },
+    {
+      "epoch": 0.86752,
+      "grad_norm": 0.32784235444910514,
+      "learning_rate": 9.0661803812633e-06,
+      "loss": 0.4953,
+      "step": 2711
+    },
+    {
+      "epoch": 0.86784,
+      "grad_norm": 0.33921486197812983,
+      "learning_rate": 9.023105382581975e-06,
+      "loss": 0.5409,
+      "step": 2712
+    },
+    {
+      "epoch": 0.86816,
+      "grad_norm": 0.9129041946953053,
+      "learning_rate": 8.980128120886722e-06,
+      "loss": 0.5119,
+      "step": 2713
+    },
+    {
+      "epoch": 0.86848,
+      "grad_norm": 0.38430006608115663,
+      "learning_rate": 8.937248642348218e-06,
+      "loss": 0.5736,
+      "step": 2714
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.35065551886106366,
+      "learning_rate": 8.894466993032147e-06,
+      "loss": 0.5856,
+      "step": 2715
+    },
+    {
+      "epoch": 0.86912,
+      "grad_norm": 0.33416199414926295,
+      "learning_rate": 8.85178321889908e-06,
+      "loss": 0.5528,
+      "step": 2716
+    },
+    {
+      "epoch": 0.86944,
+      "grad_norm": 0.37447740057728146,
+      "learning_rate": 8.809197365804401e-06,
+      "loss": 0.6337,
+      "step": 2717
+    },
+    {
+      "epoch": 0.86976,
+      "grad_norm": 0.336091885343065,
+      "learning_rate": 8.76670947949838e-06,
+      "loss": 0.565,
+      "step": 2718
+    },
+    {
+      "epoch": 0.87008,
+      "grad_norm": 0.34521337704582594,
+      "learning_rate": 8.72431960562594e-06,
+      "loss": 0.5484,
+      "step": 2719
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.35216516000853687,
+      "learning_rate": 8.68202778972681e-06,
+      "loss": 0.5447,
+      "step": 2720
+    },
+    {
+      "epoch": 0.87072,
+      "grad_norm": 0.32365542660433744,
+      "learning_rate": 8.639834077235266e-06,
+      "loss": 0.5384,
+      "step": 2721
+    },
+    {
+      "epoch": 0.87104,
+      "grad_norm": 0.3659657423997534,
+      "learning_rate": 8.597738513480302e-06,
+      "loss": 0.5619,
+      "step": 2722
+    },
+    {
+      "epoch": 0.87136,
+      "grad_norm": 0.3617534147684829,
+      "learning_rate": 8.555741143685381e-06,
+      "loss": 0.558,
+      "step": 2723
+    },
+    {
+      "epoch": 0.87168,
+      "grad_norm": 0.3370336801159234,
+      "learning_rate": 8.513842012968543e-06,
+      "loss": 0.5563,
+      "step": 2724
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.32865741894191847,
+      "learning_rate": 8.472041166342216e-06,
+      "loss": 0.5532,
+      "step": 2725
+    },
+    {
+      "epoch": 0.87232,
+      "grad_norm": 0.3187491837381094,
+      "learning_rate": 8.430338648713332e-06,
+      "loss": 0.5469,
+      "step": 2726
+    },
+    {
+      "epoch": 0.87264,
+      "grad_norm": 0.4030900100656536,
+      "learning_rate": 8.388734504883088e-06,
+      "loss": 0.6269,
+      "step": 2727
+    },
+    {
+      "epoch": 0.87296,
+      "grad_norm": 0.3622155743104728,
+      "learning_rate": 8.347228779547078e-06,
+      "loss": 0.5491,
+      "step": 2728
+    },
+    {
+      "epoch": 0.87328,
+      "grad_norm": 0.34111289607719547,
+      "learning_rate": 8.305821517295154e-06,
+      "loss": 0.5929,
+      "step": 2729
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.35619153641275386,
+      "learning_rate": 8.264512762611321e-06,
+      "loss": 0.5512,
+      "step": 2730
+    },
+    {
+      "epoch": 0.87392,
+      "grad_norm": 0.44335790091425353,
+      "learning_rate": 8.223302559873857e-06,
+      "loss": 0.5873,
+      "step": 2731
+    },
+    {
+      "epoch": 0.87424,
+      "grad_norm": 0.3340282323353771,
+      "learning_rate": 8.182190953355063e-06,
+      "loss": 0.5574,
+      "step": 2732
+    },
+    {
+      "epoch": 0.87456,
+      "grad_norm": 0.34343220247981865,
+      "learning_rate": 8.141177987221394e-06,
+      "loss": 0.5417,
+      "step": 2733
+    },
+    {
+      "epoch": 0.87488,
+      "grad_norm": 0.313879474653681,
+      "learning_rate": 8.100263705533317e-06,
+      "loss": 0.5172,
+      "step": 2734
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3594818762354667,
+      "learning_rate": 8.059448152245242e-06,
+      "loss": 0.5766,
+      "step": 2735
+    },
+    {
+      "epoch": 0.87552,
+      "grad_norm": 0.32364856195429936,
+      "learning_rate": 8.01873137120559e-06,
+      "loss": 0.5567,
+      "step": 2736
+    },
+    {
+      "epoch": 0.87584,
+      "grad_norm": 0.3749724363509122,
+      "learning_rate": 7.978113406156584e-06,
+      "loss": 0.5476,
+      "step": 2737
+    },
+    {
+      "epoch": 0.87616,
+      "grad_norm": 0.3472650981403551,
+      "learning_rate": 7.93759430073434e-06,
+      "loss": 0.5652,
+      "step": 2738
+    },
+    {
+      "epoch": 0.87648,
+      "grad_norm": 0.3533542922260968,
+      "learning_rate": 7.897174098468797e-06,
+      "loss": 0.5877,
+      "step": 2739
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.34917540582951834,
+      "learning_rate": 7.856852842783547e-06,
+      "loss": 0.5656,
+      "step": 2740
+    },
+    {
+      "epoch": 0.87712,
+      "grad_norm": 0.3623230991737994,
+      "learning_rate": 7.816630576995987e-06,
+      "loss": 0.5675,
+      "step": 2741
+    },
+    {
+      "epoch": 0.87744,
+      "grad_norm": 0.3508426262275108,
+      "learning_rate": 7.776507344317097e-06,
+      "loss": 0.5662,
+      "step": 2742
+    },
+    {
+      "epoch": 0.87776,
+      "grad_norm": 0.3533153531625396,
+      "learning_rate": 7.736483187851484e-06,
+      "loss": 0.5869,
+      "step": 2743
+    },
+    {
+      "epoch": 0.87808,
+      "grad_norm": 0.3706637973494316,
+      "learning_rate": 7.696558150597356e-06,
+      "loss": 0.5505,
+      "step": 2744
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.40496628588296135,
+      "learning_rate": 7.656732275446366e-06,
+      "loss": 0.6336,
+      "step": 2745
+    },
+    {
+      "epoch": 0.87872,
+      "grad_norm": 0.32218960924854395,
+      "learning_rate": 7.61700560518368e-06,
+      "loss": 0.5454,
+      "step": 2746
+    },
+    {
+      "epoch": 0.87904,
+      "grad_norm": 0.3303135408331546,
+      "learning_rate": 7.577378182487926e-06,
+      "loss": 0.5644,
+      "step": 2747
+    },
+    {
+      "epoch": 0.87936,
+      "grad_norm": 0.353959313934574,
+      "learning_rate": 7.537850049931006e-06,
+      "loss": 0.5964,
+      "step": 2748
+    },
+    {
+      "epoch": 0.87968,
+      "grad_norm": 0.3494529531061012,
+      "learning_rate": 7.498421249978249e-06,
+      "loss": 0.5526,
+      "step": 2749
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3695466281187068,
+      "learning_rate": 7.459091824988229e-06,
+      "loss": 0.5667,
+      "step": 2750
+    },
+    {
+      "epoch": 0.88032,
+      "grad_norm": 0.3394420751070245,
+      "learning_rate": 7.419861817212758e-06,
+      "loss": 0.5498,
+      "step": 2751
+    },
+    {
+      "epoch": 0.88064,
+      "grad_norm": 0.3751188943499607,
+      "learning_rate": 7.380731268796859e-06,
+      "loss": 0.6279,
+      "step": 2752
+    },
+    {
+      "epoch": 0.88096,
+      "grad_norm": 0.3474335882234515,
+      "learning_rate": 7.341700221778691e-06,
+      "loss": 0.613,
+      "step": 2753
+    },
+    {
+      "epoch": 0.88128,
+      "grad_norm": 0.3828786412677813,
+      "learning_rate": 7.3027687180895475e-06,
+      "loss": 0.6296,
+      "step": 2754
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.32437881156472254,
+      "learning_rate": 7.263936799553728e-06,
+      "loss": 0.5602,
+      "step": 2755
+    },
+    {
+      "epoch": 0.88192,
+      "grad_norm": 0.3295470948671417,
+      "learning_rate": 7.2252045078885945e-06,
+      "loss": 0.5965,
+      "step": 2756
+    },
+    {
+      "epoch": 0.88224,
+      "grad_norm": 0.3635410050690401,
+      "learning_rate": 7.186571884704474e-06,
+      "loss": 0.6021,
+      "step": 2757
+    },
+    {
+      "epoch": 0.88256,
+      "grad_norm": 0.33213646315506595,
+      "learning_rate": 7.1480389715046e-06,
+      "loss": 0.5428,
+      "step": 2758
+    },
+    {
+      "epoch": 0.88288,
+      "grad_norm": 0.3653316523278171,
+      "learning_rate": 7.109605809685094e-06,
+      "loss": 0.5797,
+      "step": 2759
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.33986015532916414,
+      "learning_rate": 7.071272440534937e-06,
+      "loss": 0.5734,
+      "step": 2760
+    },
+    {
+      "epoch": 0.88352,
+      "grad_norm": 0.3552870208145124,
+      "learning_rate": 7.033038905235845e-06,
+      "loss": 0.575,
+      "step": 2761
+    },
+    {
+      "epoch": 0.88384,
+      "grad_norm": 0.3576248551259227,
+      "learning_rate": 6.994905244862349e-06,
+      "loss": 0.587,
+      "step": 2762
+    },
+    {
+      "epoch": 0.88416,
+      "grad_norm": 0.3636714688443094,
+      "learning_rate": 6.956871500381634e-06,
+      "loss": 0.577,
+      "step": 2763
+    },
+    {
+      "epoch": 0.88448,
+      "grad_norm": 0.3482394817957405,
+      "learning_rate": 6.918937712653584e-06,
+      "loss": 0.5612,
+      "step": 2764
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.34634732600544005,
+      "learning_rate": 6.881103922430665e-06,
+      "loss": 0.561,
+      "step": 2765
+    },
+    {
+      "epoch": 0.88512,
+      "grad_norm": 0.375902593868274,
+      "learning_rate": 6.843370170357932e-06,
+      "loss": 0.555,
+      "step": 2766
+    },
+    {
+      "epoch": 0.88544,
+      "grad_norm": 0.3376862125987125,
+      "learning_rate": 6.80573649697297e-06,
+      "loss": 0.5674,
+      "step": 2767
+    },
+    {
+      "epoch": 0.88576,
+      "grad_norm": 0.861545912519331,
+      "learning_rate": 6.7682029427058365e-06,
+      "loss": 0.5946,
+      "step": 2768
+    },
+    {
+      "epoch": 0.88608,
+      "grad_norm": 0.34865465601876977,
+      "learning_rate": 6.7307695478790345e-06,
+      "loss": 0.546,
+      "step": 2769
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.34870578512428874,
+      "learning_rate": 6.693436352707494e-06,
+      "loss": 0.5892,
+      "step": 2770
+    },
+    {
+      "epoch": 0.88672,
+      "grad_norm": 0.35889278838134153,
+      "learning_rate": 6.656203397298433e-06,
+      "loss": 0.5831,
+      "step": 2771
+    },
+    {
+      "epoch": 0.88704,
+      "grad_norm": 0.33848557579498934,
+      "learning_rate": 6.619070721651477e-06,
+      "loss": 0.5757,
+      "step": 2772
+    },
+    {
+      "epoch": 0.88736,
+      "grad_norm": 0.4172341171161688,
+      "learning_rate": 6.5820383656584165e-06,
+      "loss": 0.5692,
+      "step": 2773
+    },
+    {
+      "epoch": 0.88768,
+      "grad_norm": 0.3879464582293012,
+      "learning_rate": 6.545106369103349e-06,
+      "loss": 0.5605,
+      "step": 2774
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.350582038472011,
+      "learning_rate": 6.5082747716625255e-06,
+      "loss": 0.558,
+      "step": 2775
+    },
+    {
+      "epoch": 0.88832,
+      "grad_norm": 0.3329044895418247,
+      "learning_rate": 6.471543612904319e-06,
+      "loss": 0.576,
+      "step": 2776
+    },
+    {
+      "epoch": 0.88864,
+      "grad_norm": 0.3491995049014739,
+      "learning_rate": 6.434912932289228e-06,
+      "loss": 0.5561,
+      "step": 2777
+    },
+    {
+      "epoch": 0.88896,
+      "grad_norm": 0.3300639753565361,
+      "learning_rate": 6.398382769169786e-06,
+      "loss": 0.5523,
+      "step": 2778
+    },
+    {
+      "epoch": 0.88928,
+      "grad_norm": 0.3490842110958772,
+      "learning_rate": 6.3619531627905904e-06,
+      "loss": 0.5391,
+      "step": 2779
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3439085395427416,
+      "learning_rate": 6.325624152288123e-06,
+      "loss": 0.5892,
+      "step": 2780
+    },
+    {
+      "epoch": 0.88992,
+      "grad_norm": 0.34509556617704645,
+      "learning_rate": 6.289395776690854e-06,
+      "loss": 0.6108,
+      "step": 2781
+    },
+    {
+      "epoch": 0.89024,
+      "grad_norm": 0.33386783087969407,
+      "learning_rate": 6.253268074919138e-06,
+      "loss": 0.5524,
+      "step": 2782
+    },
+    {
+      "epoch": 0.89056,
+      "grad_norm": 0.37653527935550135,
+      "learning_rate": 6.217241085785186e-06,
+      "loss": 0.5884,
+      "step": 2783
+    },
+    {
+      "epoch": 0.89088,
+      "grad_norm": 0.32243983124289327,
+      "learning_rate": 6.181314847992959e-06,
+      "loss": 0.5435,
+      "step": 2784
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3421011995759712,
+      "learning_rate": 6.145489400138238e-06,
+      "loss": 0.5316,
+      "step": 2785
+    },
+    {
+      "epoch": 0.89152,
+      "grad_norm": 0.37015341445689787,
+      "learning_rate": 6.109764780708482e-06,
+      "loss": 0.5693,
+      "step": 2786
+    },
+    {
+      "epoch": 0.89184,
+      "grad_norm": 0.34810680782887027,
+      "learning_rate": 6.074141028082858e-06,
+      "loss": 0.5555,
+      "step": 2787
+    },
+    {
+      "epoch": 0.89216,
+      "grad_norm": 0.36949507551060756,
+      "learning_rate": 6.038618180532174e-06,
+      "loss": 0.571,
+      "step": 2788
+    },
+    {
+      "epoch": 0.89248,
+      "grad_norm": 0.3991748582992206,
+      "learning_rate": 6.003196276218814e-06,
+      "loss": 0.5827,
+      "step": 2789
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3215523687754182,
+      "learning_rate": 5.9678753531967165e-06,
+      "loss": 0.5765,
+      "step": 2790
+    },
+    {
+      "epoch": 0.89312,
+      "grad_norm": 0.32743649458301116,
+      "learning_rate": 5.932655449411384e-06,
+      "loss": 0.5618,
+      "step": 2791
+    },
+    {
+      "epoch": 0.89344,
+      "grad_norm": 0.35078959112555913,
+      "learning_rate": 5.8975366026997046e-06,
+      "loss": 0.6208,
+      "step": 2792
+    },
+    {
+      "epoch": 0.89376,
+      "grad_norm": 0.34974224167142387,
+      "learning_rate": 5.862518850790099e-06,
+      "loss": 0.5734,
+      "step": 2793
+    },
+    {
+      "epoch": 0.89408,
+      "grad_norm": 0.33248483049491073,
+      "learning_rate": 5.8276022313022875e-06,
+      "loss": 0.5415,
+      "step": 2794
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.33611933596137067,
+      "learning_rate": 5.792786781747428e-06,
+      "loss": 0.5702,
+      "step": 2795
+    },
+    {
+      "epoch": 0.89472,
+      "grad_norm": 0.3501405426469906,
+      "learning_rate": 5.7580725395279366e-06,
+      "loss": 0.5837,
+      "step": 2796
+    },
+    {
+      "epoch": 0.89504,
+      "grad_norm": 0.3528040892370007,
+      "learning_rate": 5.723459541937515e-06,
+      "loss": 0.6133,
+      "step": 2797
+    },
+    {
+      "epoch": 0.89536,
+      "grad_norm": 0.354212207145996,
+      "learning_rate": 5.688947826161117e-06,
+      "loss": 0.6047,
+      "step": 2798
+    },
+    {
+      "epoch": 0.89568,
+      "grad_norm": 0.3704975451646289,
+      "learning_rate": 5.654537429274842e-06,
+      "loss": 0.6079,
+      "step": 2799
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3448700631938704,
+      "learning_rate": 5.620228388245996e-06,
+      "loss": 0.607,
+      "step": 2800
+    },
+    {
+      "epoch": 0.89632,
+      "grad_norm": 0.33290262588162295,
+      "learning_rate": 5.586020739932973e-06,
+      "loss": 0.5606,
+      "step": 2801
+    },
+    {
+      "epoch": 0.89664,
+      "grad_norm": 0.3398627304422116,
+      "learning_rate": 5.5519145210852105e-06,
+      "loss": 0.5032,
+      "step": 2802
+    },
+    {
+      "epoch": 0.89696,
+      "grad_norm": 0.33170687758390277,
+      "learning_rate": 5.517909768343254e-06,
+      "loss": 0.5335,
+      "step": 2803
+    },
+    {
+      "epoch": 0.89728,
+      "grad_norm": 0.37169788403853293,
+      "learning_rate": 5.4840065182385716e-06,
+      "loss": 0.5569,
+      "step": 2804
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.34250040944622306,
+      "learning_rate": 5.450204807193626e-06,
+      "loss": 0.5897,
+      "step": 2805
+    },
+    {
+      "epoch": 0.89792,
+      "grad_norm": 0.3676708902237842,
+      "learning_rate": 5.416504671521772e-06,
+      "loss": 0.6352,
+      "step": 2806
+    },
+    {
+      "epoch": 0.89824,
+      "grad_norm": 0.36384276177088926,
+      "learning_rate": 5.382906147427269e-06,
+      "loss": 0.6064,
+      "step": 2807
+    },
+    {
+      "epoch": 0.89856,
+      "grad_norm": 0.34099146203541597,
+      "learning_rate": 5.349409271005168e-06,
+      "loss": 0.5474,
+      "step": 2808
+    },
+    {
+      "epoch": 0.89888,
+      "grad_norm": 0.3345924397023783,
+      "learning_rate": 5.316014078241393e-06,
+      "loss": 0.583,
+      "step": 2809
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3335290728397917,
+      "learning_rate": 5.2827206050125585e-06,
+      "loss": 0.5806,
+      "step": 2810
+    },
+    {
+      "epoch": 0.89952,
+      "grad_norm": 0.3602870780474008,
+      "learning_rate": 5.249528887086052e-06,
+      "loss": 0.5963,
+      "step": 2811
+    },
+    {
+      "epoch": 0.89984,
+      "grad_norm": 0.33506594255551925,
+      "learning_rate": 5.216438960119885e-06,
+      "loss": 0.5606,
+      "step": 2812
+    },
+    {
+      "epoch": 0.90016,
+      "grad_norm": 0.34868184719748685,
+      "learning_rate": 5.183450859662764e-06,
+      "loss": 0.5765,
+      "step": 2813
+    },
+    {
+      "epoch": 0.90048,
+      "grad_norm": 0.3550045997969656,
+      "learning_rate": 5.150564621154019e-06,
+      "loss": 0.5898,
+      "step": 2814
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.33614302670948576,
+      "learning_rate": 5.11778027992349e-06,
+      "loss": 0.5363,
+      "step": 2815
+    },
+    {
+      "epoch": 0.90112,
+      "grad_norm": 0.3614194175997036,
+      "learning_rate": 5.085097871191591e-06,
+      "loss": 0.5268,
+      "step": 2816
+    },
+    {
+      "epoch": 0.90144,
+      "grad_norm": 0.3814564448995357,
+      "learning_rate": 5.052517430069204e-06,
+      "loss": 0.6076,
+      "step": 2817
+    },
+    {
+      "epoch": 0.90176,
+      "grad_norm": 0.36852320553559564,
+      "learning_rate": 5.020038991557674e-06,
+      "loss": 0.5706,
+      "step": 2818
+    },
+    {
+      "epoch": 0.90208,
+      "grad_norm": 0.34752850994780604,
+      "learning_rate": 4.9876625905488025e-06,
+      "loss": 0.5648,
+      "step": 2819
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.36553003238350545,
+      "learning_rate": 4.955388261824712e-06,
+      "loss": 0.5857,
+      "step": 2820
+    },
+    {
+      "epoch": 0.90272,
+      "grad_norm": 0.34547469445158907,
+      "learning_rate": 4.923216040057887e-06,
+      "loss": 0.5966,
+      "step": 2821
+    },
+    {
+      "epoch": 0.90304,
+      "grad_norm": 0.3577917172679757,
+      "learning_rate": 4.89114595981115e-06,
+      "loss": 0.5645,
+      "step": 2822
+    },
+    {
+      "epoch": 0.90336,
+      "grad_norm": 0.3451850821887828,
+      "learning_rate": 4.859178055537539e-06,
+      "loss": 0.5389,
+      "step": 2823
+    },
+    {
+      "epoch": 0.90368,
+      "grad_norm": 0.34628777076126466,
+      "learning_rate": 4.8273123615803825e-06,
+      "loss": 0.5482,
+      "step": 2824
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.34300289902896913,
+      "learning_rate": 4.7955489121731535e-06,
+      "loss": 0.5727,
+      "step": 2825
+    },
+    {
+      "epoch": 0.90432,
+      "grad_norm": 0.3678335467865954,
+      "learning_rate": 4.763887741439499e-06,
+      "loss": 0.6289,
+      "step": 2826
+    },
+    {
+      "epoch": 0.90464,
+      "grad_norm": 0.3377063031338014,
+      "learning_rate": 4.732328883393211e-06,
+      "loss": 0.582,
+      "step": 2827
+    },
+    {
+      "epoch": 0.90496,
+      "grad_norm": 0.3442615691288297,
+      "learning_rate": 4.700872371938125e-06,
+      "loss": 0.5621,
+      "step": 2828
+    },
+    {
+      "epoch": 0.90528,
+      "grad_norm": 0.3340253158301437,
+      "learning_rate": 4.669518240868176e-06,
+      "loss": 0.5284,
+      "step": 2829
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.33567183285964836,
+      "learning_rate": 4.6382665238672405e-06,
+      "loss": 0.5576,
+      "step": 2830
+    },
+    {
+      "epoch": 0.90592,
+      "grad_norm": 0.34379715853428794,
+      "learning_rate": 4.607117254509241e-06,
+      "loss": 0.581,
+      "step": 2831
+    },
+    {
+      "epoch": 0.90624,
+      "grad_norm": 0.32591788634552477,
+      "learning_rate": 4.57607046625802e-06,
+      "loss": 0.5321,
+      "step": 2832
+    },
+    {
+      "epoch": 0.90656,
+      "grad_norm": 0.38041021998211644,
+      "learning_rate": 4.545126192467308e-06,
+      "loss": 0.6333,
+      "step": 2833
+    },
+    {
+      "epoch": 0.90688,
+      "grad_norm": 0.33676411330147826,
+      "learning_rate": 4.514284466380692e-06,
+      "loss": 0.5579,
+      "step": 2834
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3343253948899122,
+      "learning_rate": 4.483545321131632e-06,
+      "loss": 0.5703,
+      "step": 2835
+    },
+    {
+      "epoch": 0.90752,
+      "grad_norm": 0.3656624643540776,
+      "learning_rate": 4.452908789743337e-06,
+      "loss": 0.5768,
+      "step": 2836
+    },
+    {
+      "epoch": 0.90784,
+      "grad_norm": 0.32530824513401746,
+      "learning_rate": 4.422374905128846e-06,
+      "loss": 0.5652,
+      "step": 2837
+    },
+    {
+      "epoch": 0.90816,
+      "grad_norm": 0.36160556475942635,
+      "learning_rate": 4.391943700090839e-06,
+      "loss": 0.5405,
+      "step": 2838
+    },
+    {
+      "epoch": 0.90848,
+      "grad_norm": 0.3410437113763867,
+      "learning_rate": 4.361615207321756e-06,
+      "loss": 0.5786,
+      "step": 2839
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.367931721500388,
+      "learning_rate": 4.331389459403668e-06,
+      "loss": 0.5827,
+      "step": 2840
+    },
+    {
+      "epoch": 0.90912,
+      "grad_norm": 0.3716330792114727,
+      "learning_rate": 4.3012664888082424e-06,
+      "loss": 0.6251,
+      "step": 2841
+    },
+    {
+      "epoch": 0.90944,
+      "grad_norm": 0.3323411659132477,
+      "learning_rate": 4.271246327896783e-06,
+      "loss": 0.5965,
+      "step": 2842
+    },
+    {
+      "epoch": 0.90976,
+      "grad_norm": 0.3369799588012157,
+      "learning_rate": 4.241329008920081e-06,
+      "loss": 0.5185,
+      "step": 2843
+    },
+    {
+      "epoch": 0.91008,
+      "grad_norm": 0.3376043256633684,
+      "learning_rate": 4.211514564018515e-06,
+      "loss": 0.5655,
+      "step": 2844
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.33634762663920625,
+      "learning_rate": 4.181803025221898e-06,
+      "loss": 0.5191,
+      "step": 2845
+    },
+    {
+      "epoch": 0.91072,
+      "grad_norm": 0.3458550837125521,
+      "learning_rate": 4.152194424449485e-06,
+      "loss": 0.5759,
+      "step": 2846
+    },
+    {
+      "epoch": 0.91104,
+      "grad_norm": 0.3768007158563421,
+      "learning_rate": 4.122688793509988e-06,
+      "loss": 0.5652,
+      "step": 2847
+    },
+    {
+      "epoch": 0.91136,
+      "grad_norm": 0.3585454547843405,
+      "learning_rate": 4.0932861641014394e-06,
+      "loss": 0.5752,
+      "step": 2848
+    },
+    {
+      "epoch": 0.91168,
+      "grad_norm": 0.3341302029503999,
+      "learning_rate": 4.063986567811273e-06,
+      "loss": 0.5889,
+      "step": 2849
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.34103806009772053,
+      "learning_rate": 4.034790036116209e-06,
+      "loss": 0.5406,
+      "step": 2850
+    },
+    {
+      "epoch": 0.91232,
+      "grad_norm": 0.34257989757405,
+      "learning_rate": 4.005696600382236e-06,
+      "loss": 0.5369,
+      "step": 2851
+    },
+    {
+      "epoch": 0.91264,
+      "grad_norm": 0.3246156810822316,
+      "learning_rate": 3.976706291864596e-06,
+      "loss": 0.5641,
+      "step": 2852
+    },
+    {
+      "epoch": 0.91296,
+      "grad_norm": 0.34452475901077434,
+      "learning_rate": 3.947819141707742e-06,
+      "loss": 0.552,
+      "step": 2853
+    },
+    {
+      "epoch": 0.91328,
+      "grad_norm": 0.32860189656766997,
+      "learning_rate": 3.919035180945297e-06,
+      "loss": 0.563,
+      "step": 2854
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.35123397263055917,
+      "learning_rate": 3.890354440500032e-06,
+      "loss": 0.582,
+      "step": 2855
+    },
+    {
+      "epoch": 0.91392,
+      "grad_norm": 0.3599111154000626,
+      "learning_rate": 3.8617769511838264e-06,
+      "loss": 0.5754,
+      "step": 2856
+    },
+    {
+      "epoch": 0.91424,
+      "grad_norm": 0.3723849372923179,
+      "learning_rate": 3.833302743697631e-06,
+      "loss": 0.5771,
+      "step": 2857
+    },
+    {
+      "epoch": 0.91456,
+      "grad_norm": 0.34801736815451534,
+      "learning_rate": 3.8049318486314657e-06,
+      "loss": 0.5659,
+      "step": 2858
+    },
+    {
+      "epoch": 0.91488,
+      "grad_norm": 0.499823699582421,
+      "learning_rate": 3.776664296464316e-06,
+      "loss": 0.5526,
+      "step": 2859
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.33305508076608026,
+      "learning_rate": 3.748500117564191e-06,
+      "loss": 0.5425,
+      "step": 2860
+    },
+    {
+      "epoch": 0.91552,
+      "grad_norm": 0.359265150726778,
+      "learning_rate": 3.7204393421880203e-06,
+      "loss": 0.5856,
+      "step": 2861
+    },
+    {
+      "epoch": 0.91584,
+      "grad_norm": 0.3511973531693837,
+      "learning_rate": 3.692482000481645e-06,
+      "loss": 0.6103,
+      "step": 2862
+    },
+    {
+      "epoch": 0.91616,
+      "grad_norm": 0.3485342444756007,
+      "learning_rate": 3.6646281224798075e-06,
+      "loss": 0.6073,
+      "step": 2863
+    },
+    {
+      "epoch": 0.91648,
+      "grad_norm": 0.3331230979882821,
+      "learning_rate": 3.6368777381060483e-06,
+      "loss": 0.5628,
+      "step": 2864
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.360453748430889,
+      "learning_rate": 3.609230877172798e-06,
+      "loss": 0.519,
+      "step": 2865
+    },
+    {
+      "epoch": 0.91712,
+      "grad_norm": 0.35306952896812904,
+      "learning_rate": 3.5816875693812314e-06,
+      "loss": 0.5667,
+      "step": 2866
+    },
+    {
+      "epoch": 0.91744,
+      "grad_norm": 0.3479611248165948,
+      "learning_rate": 3.554247844321257e-06,
+      "loss": 0.6038,
+      "step": 2867
+    },
+    {
+      "epoch": 0.91776,
+      "grad_norm": 0.32958588483131146,
+      "learning_rate": 3.5269117314715495e-06,
+      "loss": 0.5415,
+      "step": 2868
+    },
+    {
+      "epoch": 0.91808,
+      "grad_norm": 0.3472610183107072,
+      "learning_rate": 3.4996792601994287e-06,
+      "loss": 0.5858,
+      "step": 2869
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3343199575343456,
+      "learning_rate": 3.4725504597608816e-06,
+      "loss": 0.5404,
+      "step": 2870
+    },
+    {
+      "epoch": 0.91872,
+      "grad_norm": 0.33591339801874,
+      "learning_rate": 3.445525359300561e-06,
+      "loss": 0.587,
+      "step": 2871
+    },
+    {
+      "epoch": 0.91904,
+      "grad_norm": 0.3519781818579308,
+      "learning_rate": 3.4186039878516653e-06,
+      "loss": 0.559,
+      "step": 2872
+    },
+    {
+      "epoch": 0.91936,
+      "grad_norm": 0.3655017032831667,
+      "learning_rate": 3.3917863743359813e-06,
+      "loss": 0.5908,
+      "step": 2873
+    },
+    {
+      "epoch": 0.91968,
+      "grad_norm": 0.33429192412287345,
+      "learning_rate": 3.365072547563797e-06,
+      "loss": 0.5728,
+      "step": 2874
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.34623141842115757,
+      "learning_rate": 3.338462536233955e-06,
+      "loss": 0.5545,
+      "step": 2875
+    },
+    {
+      "epoch": 0.92032,
+      "grad_norm": 0.3384627725450504,
+      "learning_rate": 3.311956368933733e-06,
+      "loss": 0.5382,
+      "step": 2876
+    },
+    {
+      "epoch": 0.92064,
+      "grad_norm": 0.5896368414027378,
+      "learning_rate": 3.2855540741388414e-06,
+      "loss": 0.5648,
+      "step": 2877
+    },
+    {
+      "epoch": 0.92096,
+      "grad_norm": 0.3486875469918097,
+      "learning_rate": 3.2592556802134244e-06,
+      "loss": 0.5481,
+      "step": 2878
+    },
+    {
+      "epoch": 0.92128,
+      "grad_norm": 0.34310645471152623,
+      "learning_rate": 3.2330612154099936e-06,
+      "loss": 0.568,
+      "step": 2879
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.35614170986681676,
+      "learning_rate": 3.2069707078694057e-06,
+      "loss": 0.5888,
+      "step": 2880
+    },
+    {
+      "epoch": 0.92192,
+      "grad_norm": 0.3167398265351191,
+      "learning_rate": 3.180984185620839e-06,
+      "loss": 0.5221,
+      "step": 2881
+    },
+    {
+      "epoch": 0.92224,
+      "grad_norm": 0.3700634777584648,
+      "learning_rate": 3.155101676581762e-06,
+      "loss": 0.5251,
+      "step": 2882
+    },
+    {
+      "epoch": 0.92256,
+      "grad_norm": 0.38675264625911526,
+      "learning_rate": 3.1293232085578883e-06,
+      "loss": 0.6143,
+      "step": 2883
+    },
+    {
+      "epoch": 0.92288,
+      "grad_norm": 0.3400966490316834,
+      "learning_rate": 3.103648809243187e-06,
+      "loss": 0.5509,
+      "step": 2884
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.36867499153736294,
+      "learning_rate": 3.078078506219784e-06,
+      "loss": 0.5717,
+      "step": 2885
+    },
+    {
+      "epoch": 0.92352,
+      "grad_norm": 0.3229408702567602,
+      "learning_rate": 3.0526123269580377e-06,
+      "loss": 0.5635,
+      "step": 2886
+    },
+    {
+      "epoch": 0.92384,
+      "grad_norm": 0.39804029753224696,
+      "learning_rate": 3.027250298816364e-06,
+      "loss": 0.5803,
+      "step": 2887
+    },
+    {
+      "epoch": 0.92416,
+      "grad_norm": 0.36141868305260705,
+      "learning_rate": 3.0019924490413685e-06,
+      "loss": 0.5929,
+      "step": 2888
+    },
+    {
+      "epoch": 0.92448,
+      "grad_norm": 0.340395146432603,
+      "learning_rate": 2.976838804767668e-06,
+      "loss": 0.5675,
+      "step": 2889
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.33833354131370824,
+      "learning_rate": 2.9517893930179785e-06,
+      "loss": 0.5409,
+      "step": 2890
+    },
+    {
+      "epoch": 0.92512,
+      "grad_norm": 0.34764889418401584,
+      "learning_rate": 2.9268442407030196e-06,
+      "loss": 0.5576,
+      "step": 2891
+    },
+    {
+      "epoch": 0.92544,
+      "grad_norm": 0.34512214030953275,
+      "learning_rate": 2.9020033746215313e-06,
+      "loss": 0.6083,
+      "step": 2892
+    },
+    {
+      "epoch": 0.92576,
+      "grad_norm": 0.34386328514826,
+      "learning_rate": 2.877266821460145e-06,
+      "loss": 0.5805,
+      "step": 2893
+    },
+    {
+      "epoch": 0.92608,
+      "grad_norm": 0.3675681102152316,
+      "learning_rate": 2.852634607793525e-06,
+      "loss": 0.5813,
+      "step": 2894
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3387088572847811,
+      "learning_rate": 2.8281067600841705e-06,
+      "loss": 0.5569,
+      "step": 2895
+    },
+    {
+      "epoch": 0.92672,
+      "grad_norm": 0.3384830103818643,
+      "learning_rate": 2.8036833046824917e-06,
+      "loss": 0.533,
+      "step": 2896
+    },
+    {
+      "epoch": 0.92704,
+      "grad_norm": 0.34017090324653076,
+      "learning_rate": 2.7793642678267563e-06,
+      "loss": 0.5812,
+      "step": 2897
+    },
+    {
+      "epoch": 0.92736,
+      "grad_norm": 0.3375396695080064,
+      "learning_rate": 2.7551496756430094e-06,
+      "loss": 0.5948,
+      "step": 2898
+    },
+    {
+      "epoch": 0.92768,
+      "grad_norm": 0.3714118755082262,
+      "learning_rate": 2.731039554145165e-06,
+      "loss": 0.5902,
+      "step": 2899
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.36010523570409236,
+      "learning_rate": 2.7070339292348147e-06,
+      "loss": 0.5735,
+      "step": 2900
+    },
+    {
+      "epoch": 0.92832,
+      "grad_norm": 0.3445063157061339,
+      "learning_rate": 2.6831328267013624e-06,
+      "loss": 0.5686,
+      "step": 2901
+    },
+    {
+      "epoch": 0.92864,
+      "grad_norm": 0.33452317572382473,
+      "learning_rate": 2.659336272221913e-06,
+      "loss": 0.5453,
+      "step": 2902
+    },
+    {
+      "epoch": 0.92896,
+      "grad_norm": 0.34418474725527026,
+      "learning_rate": 2.6356442913612054e-06,
+      "loss": 0.6022,
+      "step": 2903
+    },
+    {
+      "epoch": 0.92928,
+      "grad_norm": 0.3417053895305936,
+      "learning_rate": 2.6120569095716806e-06,
+      "loss": 0.5401,
+      "step": 2904
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4323828368821946,
+      "learning_rate": 2.5885741521933902e-06,
+      "loss": 0.5795,
+      "step": 2905
+    },
+    {
+      "epoch": 0.92992,
+      "grad_norm": 0.34626103032463873,
+      "learning_rate": 2.565196044453988e-06,
+      "loss": 0.5449,
+      "step": 2906
+    },
+    {
+      "epoch": 0.93024,
+      "grad_norm": 0.3482572315889939,
+      "learning_rate": 2.541922611468728e-06,
+      "loss": 0.5427,
+      "step": 2907
+    },
+    {
+      "epoch": 0.93056,
+      "grad_norm": 0.34881581725772226,
+      "learning_rate": 2.518753878240365e-06,
+      "loss": 0.5575,
+      "step": 2908
+    },
+    {
+      "epoch": 0.93088,
+      "grad_norm": 0.3634452371924472,
+      "learning_rate": 2.4956898696592124e-06,
+      "loss": 0.5936,
+      "step": 2909
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.32644715446893524,
+      "learning_rate": 2.47273061050306e-06,
+      "loss": 0.5412,
+      "step": 2910
+    },
+    {
+      "epoch": 0.93152,
+      "grad_norm": 0.3472435572486151,
+      "learning_rate": 2.449876125437156e-06,
+      "loss": 0.5803,
+      "step": 2911
+    },
+    {
+      "epoch": 0.93184,
+      "grad_norm": 0.33880253692206175,
+      "learning_rate": 2.4271264390142267e-06,
+      "loss": 0.6003,
+      "step": 2912
+    },
+    {
+      "epoch": 0.93216,
+      "grad_norm": 0.371720063380355,
+      "learning_rate": 2.4044815756743553e-06,
+      "loss": 0.6396,
+      "step": 2913
+    },
+    {
+      "epoch": 0.93248,
+      "grad_norm": 0.32725071098545716,
+      "learning_rate": 2.3819415597450825e-06,
+      "loss": 0.5768,
+      "step": 2914
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3479775228639434,
+      "learning_rate": 2.3595064154412374e-06,
+      "loss": 0.583,
+      "step": 2915
+    },
+    {
+      "epoch": 0.93312,
+      "grad_norm": 0.37140340929061805,
+      "learning_rate": 2.3371761668650404e-06,
+      "loss": 0.5962,
+      "step": 2916
+    },
+    {
+      "epoch": 0.93344,
+      "grad_norm": 0.338309123875935,
+      "learning_rate": 2.3149508380060025e-06,
+      "loss": 0.5742,
+      "step": 2917
+    },
+    {
+      "epoch": 0.93376,
+      "grad_norm": 0.4000009841628731,
+      "learning_rate": 2.2928304527409127e-06,
+      "loss": 0.5643,
+      "step": 2918
+    },
+    {
+      "epoch": 0.93408,
+      "grad_norm": 0.34963884051711597,
+      "learning_rate": 2.2708150348338176e-06,
+      "loss": 0.5929,
+      "step": 2919
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.34593758884301407,
+      "learning_rate": 2.2489046079360198e-06,
+      "loss": 0.5518,
+      "step": 2920
+    },
+    {
+      "epoch": 0.93472,
+      "grad_norm": 0.36935830768837036,
+      "learning_rate": 2.227099195586002e-06,
+      "loss": 0.6002,
+      "step": 2921
+    },
+    {
+      "epoch": 0.93504,
+      "grad_norm": 0.3593240870270949,
+      "learning_rate": 2.205398821209459e-06,
+      "loss": 0.5714,
+      "step": 2922
+    },
+    {
+      "epoch": 0.93536,
+      "grad_norm": 0.355966251151385,
+      "learning_rate": 2.1838035081191866e-06,
+      "loss": 0.6101,
+      "step": 2923
+    },
+    {
+      "epoch": 0.93568,
+      "grad_norm": 0.343728318953142,
+      "learning_rate": 2.1623132795151824e-06,
+      "loss": 0.5109,
+      "step": 2924
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.34648754539327065,
+      "learning_rate": 2.140928158484523e-06,
+      "loss": 0.5327,
+      "step": 2925
+    },
+    {
+      "epoch": 0.93632,
+      "grad_norm": 0.3749897065975177,
+      "learning_rate": 2.1196481680013314e-06,
+      "loss": 0.5875,
+      "step": 2926
+    },
+    {
+      "epoch": 0.93664,
+      "grad_norm": 0.3841768188244641,
+      "learning_rate": 2.0984733309268424e-06,
+      "loss": 0.5799,
+      "step": 2927
+    },
+    {
+      "epoch": 0.93696,
+      "grad_norm": 0.352868203688293,
+      "learning_rate": 2.0774036700093036e-06,
+      "loss": 0.6127,
+      "step": 2928
+    },
+    {
+      "epoch": 0.93728,
+      "grad_norm": 0.36784058805691916,
+      "learning_rate": 2.0564392078839644e-06,
+      "loss": 0.6259,
+      "step": 2929
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.35661481974517606,
+      "learning_rate": 2.0355799670730645e-06,
+      "loss": 0.5455,
+      "step": 2930
+    },
+    {
+      "epoch": 0.93792,
+      "grad_norm": 0.34117875897031086,
+      "learning_rate": 2.0148259699857895e-06,
+      "loss": 0.5877,
+      "step": 2931
+    },
+    {
+      "epoch": 0.93824,
+      "grad_norm": 0.33847074964350893,
+      "learning_rate": 1.9941772389182935e-06,
+      "loss": 0.5019,
+      "step": 2932
+    },
+    {
+      "epoch": 0.93856,
+      "grad_norm": 0.3325235779600918,
+      "learning_rate": 1.97363379605362e-06,
+      "loss": 0.5952,
+      "step": 2933
+    },
+    {
+      "epoch": 0.93888,
+      "grad_norm": 0.3437097787712463,
+      "learning_rate": 1.9531956634617044e-06,
+      "loss": 0.5719,
+      "step": 2934
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3338308091140957,
+      "learning_rate": 1.9328628630993386e-06,
+      "loss": 0.5753,
+      "step": 2935
+    },
+    {
+      "epoch": 0.93952,
+      "grad_norm": 0.3357846986464912,
+      "learning_rate": 1.91263541681016e-06,
+      "loss": 0.5668,
+      "step": 2936
+    },
+    {
+      "epoch": 0.93984,
+      "grad_norm": 0.36589773617524324,
+      "learning_rate": 1.8925133463246425e-06,
+      "loss": 0.5296,
+      "step": 2937
+    },
+    {
+      "epoch": 0.94016,
+      "grad_norm": 0.34741287859319475,
+      "learning_rate": 1.872496673260038e-06,
+      "loss": 0.5665,
+      "step": 2938
+    },
+    {
+      "epoch": 0.94048,
+      "grad_norm": 0.36330777305164874,
+      "learning_rate": 1.8525854191203562e-06,
+      "loss": 0.6009,
+      "step": 2939
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.34104369081316266,
+      "learning_rate": 1.8327796052963752e-06,
+      "loss": 0.5838,
+      "step": 2940
+    },
+    {
+      "epoch": 0.94112,
+      "grad_norm": 0.3558130290045266,
+      "learning_rate": 1.813079253065597e-06,
+      "loss": 0.5918,
+      "step": 2941
+    },
+    {
+      "epoch": 0.94144,
+      "grad_norm": 0.34951431053086685,
+      "learning_rate": 1.7934843835922144e-06,
+      "loss": 0.5879,
+      "step": 2942
+    },
+    {
+      "epoch": 0.94176,
+      "grad_norm": 0.33169653160506357,
+      "learning_rate": 1.7739950179271103e-06,
+      "loss": 0.5474,
+      "step": 2943
+    },
+    {
+      "epoch": 0.94208,
+      "grad_norm": 0.3340890778759615,
+      "learning_rate": 1.7546111770078144e-06,
+      "loss": 0.5509,
+      "step": 2944
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3294918454159986,
+      "learning_rate": 1.7353328816584913e-06,
+      "loss": 0.5574,
+      "step": 2945
+    },
+    {
+      "epoch": 0.94272,
+      "grad_norm": 0.3378355069960122,
+      "learning_rate": 1.7161601525899407e-06,
+      "loss": 0.5605,
+      "step": 2946
+    },
+    {
+      "epoch": 0.94304,
+      "grad_norm": 0.347360388900877,
+      "learning_rate": 1.6970930103994974e-06,
+      "loss": 0.6079,
+      "step": 2947
+    },
+    {
+      "epoch": 0.94336,
+      "grad_norm": 0.5379559421601664,
+      "learning_rate": 1.6781314755711319e-06,
+      "loss": 0.5738,
+      "step": 2948
+    },
+    {
+      "epoch": 0.94368,
+      "grad_norm": 0.35090601722345,
+      "learning_rate": 1.6592755684753047e-06,
+      "loss": 0.5876,
+      "step": 2949
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.35234845654112756,
+      "learning_rate": 1.6405253093690343e-06,
+      "loss": 0.5872,
+      "step": 2950
+    },
+    {
+      "epoch": 0.94432,
+      "grad_norm": 0.3680415823701676,
+      "learning_rate": 1.6218807183958295e-06,
+      "loss": 0.5551,
+      "step": 2951
+    },
+    {
+      "epoch": 0.94464,
+      "grad_norm": 0.3314338753479147,
+      "learning_rate": 1.6033418155856794e-06,
+      "loss": 0.5415,
+      "step": 2952
+    },
+    {
+      "epoch": 0.94496,
+      "grad_norm": 0.38456158513215843,
+      "learning_rate": 1.584908620855019e-06,
+      "loss": 0.5877,
+      "step": 2953
+    },
+    {
+      "epoch": 0.94528,
+      "grad_norm": 0.35445987277445967,
+      "learning_rate": 1.5665811540067409e-06,
+      "loss": 0.5378,
+      "step": 2954
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.34834016754937597,
+      "learning_rate": 1.54835943473014e-06,
+      "loss": 0.566,
+      "step": 2955
+    },
+    {
+      "epoch": 0.94592,
+      "grad_norm": 0.35124995819587745,
+      "learning_rate": 1.5302434826009349e-06,
+      "loss": 0.5474,
+      "step": 2956
+    },
+    {
+      "epoch": 0.94624,
+      "grad_norm": 0.34220984783938785,
+      "learning_rate": 1.5122333170811576e-06,
+      "loss": 0.5474,
+      "step": 2957
+    },
+    {
+      "epoch": 0.94656,
+      "grad_norm": 0.34712881756806974,
+      "learning_rate": 1.4943289575192421e-06,
+      "loss": 0.5703,
+      "step": 2958
+    },
+    {
+      "epoch": 0.94688,
+      "grad_norm": 0.3502499119432055,
+      "learning_rate": 1.4765304231499578e-06,
+      "loss": 0.5774,
+      "step": 2959
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3513076058690715,
+      "learning_rate": 1.4588377330943536e-06,
+      "loss": 0.5874,
+      "step": 2960
+    },
+    {
+      "epoch": 0.94752,
+      "grad_norm": 0.3537578390048591,
+      "learning_rate": 1.4412509063598034e-06,
+      "loss": 0.6439,
+      "step": 2961
+    },
+    {
+      "epoch": 0.94784,
+      "grad_norm": 0.3665668483613586,
+      "learning_rate": 1.4237699618399048e-06,
+      "loss": 0.5933,
+      "step": 2962
+    },
+    {
+      "epoch": 0.94816,
+      "grad_norm": 0.35312769657275656,
+      "learning_rate": 1.4063949183145463e-06,
+      "loss": 0.5918,
+      "step": 2963
+    },
+    {
+      "epoch": 0.94848,
+      "grad_norm": 0.3336561524014225,
+      "learning_rate": 1.3891257944498416e-06,
+      "loss": 0.5959,
+      "step": 2964
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3608145588685783,
+      "learning_rate": 1.3719626087981052e-06,
+      "loss": 0.5368,
+      "step": 2965
+    },
+    {
+      "epoch": 0.94912,
+      "grad_norm": 0.3730366746726397,
+      "learning_rate": 1.354905379797844e-06,
+      "loss": 0.5382,
+      "step": 2966
+    },
+    {
+      "epoch": 0.94944,
+      "grad_norm": 0.32338501285008003,
+      "learning_rate": 1.3379541257737217e-06,
+      "loss": 0.5561,
+      "step": 2967
+    },
+    {
+      "epoch": 0.94976,
+      "grad_norm": 0.3526047077386501,
+      "learning_rate": 1.321108864936571e-06,
+      "loss": 0.5963,
+      "step": 2968
+    },
+    {
+      "epoch": 0.95008,
+      "grad_norm": 0.3960926123725681,
+      "learning_rate": 1.3043696153833717e-06,
+      "loss": 0.6461,
+      "step": 2969
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3421126169413257,
+      "learning_rate": 1.287736395097161e-06,
+      "loss": 0.5622,
+      "step": 2970
+    },
+    {
+      "epoch": 0.95072,
+      "grad_norm": 0.3463639607176593,
+      "learning_rate": 1.2712092219471227e-06,
+      "loss": 0.5815,
+      "step": 2971
+    },
+    {
+      "epoch": 0.95104,
+      "grad_norm": 0.36283792504186074,
+      "learning_rate": 1.2547881136884654e-06,
+      "loss": 0.5378,
+      "step": 2972
+    },
+    {
+      "epoch": 0.95136,
+      "grad_norm": 0.3367440416990651,
+      "learning_rate": 1.2384730879625106e-06,
+      "loss": 0.5747,
+      "step": 2973
+    },
+    {
+      "epoch": 0.95168,
+      "grad_norm": 0.3477123428091511,
+      "learning_rate": 1.2222641622965604e-06,
+      "loss": 0.5391,
+      "step": 2974
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.34504475093450154,
+      "learning_rate": 1.206161354103963e-06,
+      "loss": 0.581,
+      "step": 2975
+    },
+    {
+      "epoch": 0.95232,
+      "grad_norm": 0.3653522597405551,
+      "learning_rate": 1.1901646806840471e-06,
+      "loss": 0.5977,
+      "step": 2976
+    },
+    {
+      "epoch": 0.95264,
+      "grad_norm": 0.3618678438617388,
+      "learning_rate": 1.174274159222133e-06,
+      "loss": 0.6008,
+      "step": 2977
+    },
+    {
+      "epoch": 0.95296,
+      "grad_norm": 0.3596330442437025,
+      "learning_rate": 1.1584898067894867e-06,
+      "loss": 0.6252,
+      "step": 2978
+    },
+    {
+      "epoch": 0.95328,
+      "grad_norm": 0.3320449626840452,
+      "learning_rate": 1.1428116403433554e-06,
+      "loss": 0.5754,
+      "step": 2979
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.33215490910901957,
+      "learning_rate": 1.1272396767268433e-06,
+      "loss": 0.5494,
+      "step": 2980
+    },
+    {
+      "epoch": 0.95392,
+      "grad_norm": 0.34818280951106906,
+      "learning_rate": 1.1117739326690247e-06,
+      "loss": 0.5876,
+      "step": 2981
+    },
+    {
+      "epoch": 0.95424,
+      "grad_norm": 0.347376607901708,
+      "learning_rate": 1.0964144247848197e-06,
+      "loss": 0.5342,
+      "step": 2982
+    },
+    {
+      "epoch": 0.95456,
+      "grad_norm": 0.36345764361124655,
+      "learning_rate": 1.0811611695750513e-06,
+      "loss": 0.5983,
+      "step": 2983
+    },
+    {
+      "epoch": 0.95488,
+      "grad_norm": 0.35966788905303054,
+      "learning_rate": 1.0660141834263449e-06,
+      "loss": 0.5971,
+      "step": 2984
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.32790846520908623,
+      "learning_rate": 1.0509734826112394e-06,
+      "loss": 0.5089,
+      "step": 2985
+    },
+    {
+      "epoch": 0.95552,
+      "grad_norm": 0.36823188901468407,
+      "learning_rate": 1.0360390832879985e-06,
+      "loss": 0.5776,
+      "step": 2986
+    },
+    {
+      "epoch": 0.95584,
+      "grad_norm": 0.33943531308732744,
+      "learning_rate": 1.0212110015007547e-06,
+      "loss": 0.5786,
+      "step": 2987
+    },
+    {
+      "epoch": 0.95616,
+      "grad_norm": 0.34875097707546227,
+      "learning_rate": 1.006489253179388e-06,
+      "loss": 0.5677,
+      "step": 2988
+    },
+    {
+      "epoch": 0.95648,
+      "grad_norm": 0.3455504822690956,
+      "learning_rate": 9.918738541395578e-07,
+      "loss": 0.5526,
+      "step": 2989
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4590995078853539,
+      "learning_rate": 9.773648200826823e-07,
+      "loss": 0.5361,
+      "step": 2990
+    },
+    {
+      "epoch": 0.95712,
+      "grad_norm": 0.3287794338622283,
+      "learning_rate": 9.62962166595882e-07,
+      "loss": 0.6143,
+      "step": 2991
+    },
+    {
+      "epoch": 0.95744,
+      "grad_norm": 0.33925374881083975,
+      "learning_rate": 9.486659091520244e-07,
+      "loss": 0.5597,
+      "step": 2992
+    },
+    {
+      "epoch": 0.95776,
+      "grad_norm": 0.43241540955994434,
+      "learning_rate": 9.344760631096239e-07,
+      "loss": 0.5561,
+      "step": 2993
+    },
+    {
+      "epoch": 0.95808,
+      "grad_norm": 0.3891603979596362,
+      "learning_rate": 9.203926437129528e-07,
+      "loss": 0.5409,
+      "step": 2994
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.38420498082309823,
+      "learning_rate": 9.064156660918865e-07,
+      "loss": 0.6308,
+      "step": 2995
+    },
+    {
+      "epoch": 0.95872,
+      "grad_norm": 0.36953905631438,
+      "learning_rate": 8.925451452619693e-07,
+      "loss": 0.6094,
+      "step": 2996
+    },
+    {
+      "epoch": 0.95904,
+      "grad_norm": 0.3620247581257889,
+      "learning_rate": 8.787810961243814e-07,
+      "loss": 0.6002,
+      "step": 2997
+    },
+    {
+      "epoch": 0.95936,
+      "grad_norm": 0.33706859882714446,
+      "learning_rate": 8.651235334659169e-07,
+      "loss": 0.5974,
+      "step": 2998
+    },
+    {
+      "epoch": 0.95968,
+      "grad_norm": 0.34774307956372846,
+      "learning_rate": 8.515724719589835e-07,
+      "loss": 0.5589,
+      "step": 2999
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.334074929758646,
+      "learning_rate": 8.381279261615471e-07,
+      "loss": 0.5466,
+      "step": 3000
+    },
+    {
+      "epoch": 0.96032,
+      "grad_norm": 0.3610137435702167,
+      "learning_rate": 8.247899105171652e-07,
+      "loss": 0.5521,
+      "step": 3001
+    },
+    {
+      "epoch": 0.96064,
+      "grad_norm": 0.36058569173337224,
+      "learning_rate": 8.115584393549425e-07,
+      "loss": 0.5818,
+      "step": 3002
+    },
+    {
+      "epoch": 0.96096,
+      "grad_norm": 0.35159953169862573,
+      "learning_rate": 7.984335268895193e-07,
+      "loss": 0.5708,
+      "step": 3003
+    },
+    {
+      "epoch": 0.96128,
+      "grad_norm": 0.3423249349751101,
+      "learning_rate": 7.854151872210614e-07,
+      "loss": 0.6078,
+      "step": 3004
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3383356696682219,
+      "learning_rate": 7.725034343352478e-07,
+      "loss": 0.6123,
+      "step": 3005
+    },
+    {
+      "epoch": 0.96192,
+      "grad_norm": 0.3323887843584124,
+      "learning_rate": 7.596982821032494e-07,
+      "loss": 0.5518,
+      "step": 3006
+    },
+    {
+      "epoch": 0.96224,
+      "grad_norm": 0.34685414527044406,
+      "learning_rate": 7.469997442816957e-07,
+      "loss": 0.6,
+      "step": 3007
+    },
+    {
+      "epoch": 0.96256,
+      "grad_norm": 0.3376624771473842,
+      "learning_rate": 7.344078345127292e-07,
+      "loss": 0.5722,
+      "step": 3008
+    },
+    {
+      "epoch": 0.96288,
+      "grad_norm": 0.3439905129336635,
+      "learning_rate": 7.219225663238738e-07,
+      "loss": 0.5529,
+      "step": 3009
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3399654458516724,
+      "learning_rate": 7.095439531281556e-07,
+      "loss": 0.5803,
+      "step": 3010
+    },
+    {
+      "epoch": 0.96352,
+      "grad_norm": 0.33972260289665,
+      "learning_rate": 6.972720082239481e-07,
+      "loss": 0.5828,
+      "step": 3011
+    },
+    {
+      "epoch": 0.96384,
+      "grad_norm": 0.32789095864566037,
+      "learning_rate": 6.851067447951054e-07,
+      "loss": 0.5288,
+      "step": 3012
+    },
+    {
+      "epoch": 0.96416,
+      "grad_norm": 0.32753837536877084,
+      "learning_rate": 6.730481759108287e-07,
+      "loss": 0.552,
+      "step": 3013
+    },
+    {
+      "epoch": 0.96448,
+      "grad_norm": 0.3247709604757158,
+      "learning_rate": 6.610963145256999e-07,
+      "loss": 0.5195,
+      "step": 3014
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.42050237723938133,
+      "learning_rate": 6.492511734796702e-07,
+      "loss": 0.5685,
+      "step": 3015
+    },
+    {
+      "epoch": 0.96512,
+      "grad_norm": 0.32661375396326997,
+      "learning_rate": 6.375127654980495e-07,
+      "loss": 0.5581,
+      "step": 3016
+    },
+    {
+      "epoch": 0.96544,
+      "grad_norm": 0.38819378922556824,
+      "learning_rate": 6.258811031914613e-07,
+      "loss": 0.6018,
+      "step": 3017
+    },
+    {
+      "epoch": 0.96576,
+      "grad_norm": 0.323920209103656,
+      "learning_rate": 6.143561990558877e-07,
+      "loss": 0.5423,
+      "step": 3018
+    },
+    {
+      "epoch": 0.96608,
+      "grad_norm": 0.3532360252455809,
+      "learning_rate": 6.029380654725691e-07,
+      "loss": 0.5779,
+      "step": 3019
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.33802407087018976,
+      "learning_rate": 5.916267147080934e-07,
+      "loss": 0.5869,
+      "step": 3020
+    },
+    {
+      "epoch": 0.96672,
+      "grad_norm": 0.3338179461751494,
+      "learning_rate": 5.804221589142955e-07,
+      "loss": 0.5722,
+      "step": 3021
+    },
+    {
+      "epoch": 0.96704,
+      "grad_norm": 0.3325262151259706,
+      "learning_rate": 5.693244101282913e-07,
+      "loss": 0.5459,
+      "step": 3022
+    },
+    {
+      "epoch": 0.96736,
+      "grad_norm": 0.36110785149431884,
+      "learning_rate": 5.583334802724661e-07,
+      "loss": 0.589,
+      "step": 3023
+    },
+    {
+      "epoch": 0.96768,
+      "grad_norm": 0.34241363205791653,
+      "learning_rate": 5.474493811544301e-07,
+      "loss": 0.538,
+      "step": 3024
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3476167965285925,
+      "learning_rate": 5.366721244670303e-07,
+      "loss": 0.5898,
+      "step": 3025
+    },
+    {
+      "epoch": 0.96832,
+      "grad_norm": 0.34763030822207996,
+      "learning_rate": 5.260017217883273e-07,
+      "loss": 0.5807,
+      "step": 3026
+    },
+    {
+      "epoch": 0.96864,
+      "grad_norm": 0.35025748580491395,
+      "learning_rate": 5.15438184581607e-07,
+      "loss": 0.5733,
+      "step": 3027
+    },
+    {
+      "epoch": 0.96896,
+      "grad_norm": 0.32257374496859864,
+      "learning_rate": 5.04981524195347e-07,
+      "loss": 0.5356,
+      "step": 3028
+    },
+    {
+      "epoch": 0.96928,
+      "grad_norm": 0.34391409073006424,
+      "learning_rate": 4.946317518631616e-07,
+      "loss": 0.5361,
+      "step": 3029
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3412407552958236,
+      "learning_rate": 4.843888787039009e-07,
+      "loss": 0.5774,
+      "step": 3030
+    },
+    {
+      "epoch": 0.96992,
+      "grad_norm": 0.35401081087623015,
+      "learning_rate": 4.742529157215181e-07,
+      "loss": 0.5344,
+      "step": 3031
+    },
+    {
+      "epoch": 0.97024,
+      "grad_norm": 0.3424814726842407,
+      "learning_rate": 4.642238738051474e-07,
+      "loss": 0.5904,
+      "step": 3032
+    },
+    {
+      "epoch": 0.97056,
+      "grad_norm": 0.3540963795440962,
+      "learning_rate": 4.54301763729037e-07,
+      "loss": 0.5674,
+      "step": 3033
+    },
+    {
+      "epoch": 0.97088,
+      "grad_norm": 0.35331654650540045,
+      "learning_rate": 4.4448659615258236e-07,
+      "loss": 0.5615,
+      "step": 3034
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3343292615605365,
+      "learning_rate": 4.3477838162024886e-07,
+      "loss": 0.5717,
+      "step": 3035
+    },
+    {
+      "epoch": 0.97152,
+      "grad_norm": 0.3576719247538726,
+      "learning_rate": 4.251771305616381e-07,
+      "loss": 0.6138,
+      "step": 3036
+    },
+    {
+      "epoch": 0.97184,
+      "grad_norm": 0.33256575491615603,
+      "learning_rate": 4.1568285329143254e-07,
+      "loss": 0.5943,
+      "step": 3037
+    },
+    {
+      "epoch": 0.97216,
+      "grad_norm": 0.3574334143465092,
+      "learning_rate": 4.062955600093732e-07,
+      "loss": 0.5547,
+      "step": 3038
+    },
+    {
+      "epoch": 0.97248,
+      "grad_norm": 0.35457194320985824,
+      "learning_rate": 3.9701526080029304e-07,
+      "loss": 0.5894,
+      "step": 3039
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.33249794363493285,
+      "learning_rate": 3.878419656340726e-07,
+      "loss": 0.5951,
+      "step": 3040
+    },
+    {
+      "epoch": 0.97312,
+      "grad_norm": 0.3538138734380768,
+      "learning_rate": 3.7877568436562873e-07,
+      "loss": 0.574,
+      "step": 3041
+    },
+    {
+      "epoch": 0.97344,
+      "grad_norm": 0.36469331123021703,
+      "learning_rate": 3.698164267349036e-07,
+      "loss": 0.6239,
+      "step": 3042
+    },
+    {
+      "epoch": 0.97376,
+      "grad_norm": 0.3443538502837614,
+      "learning_rate": 3.60964202366898e-07,
+      "loss": 0.5874,
+      "step": 3043
+    },
+    {
+      "epoch": 0.97408,
+      "grad_norm": 0.3878681209013827,
+      "learning_rate": 3.522190207716047e-07,
+      "loss": 0.5756,
+      "step": 3044
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3472484826445253,
+      "learning_rate": 3.4358089134400863e-07,
+      "loss": 0.571,
+      "step": 3045
+    },
+    {
+      "epoch": 0.97472,
+      "grad_norm": 0.3385241997060716,
+      "learning_rate": 3.350498233641086e-07,
+      "loss": 0.5653,
+      "step": 3046
+    },
+    {
+      "epoch": 0.97504,
+      "grad_norm": 0.34946707608166916,
+      "learning_rate": 3.266258259968846e-07,
+      "loss": 0.577,
+      "step": 3047
+    },
+    {
+      "epoch": 0.97536,
+      "grad_norm": 0.32779632967965444,
+      "learning_rate": 3.183089082922641e-07,
+      "loss": 0.5558,
+      "step": 3048
+    },
+    {
+      "epoch": 0.97568,
+      "grad_norm": 0.38213428342768524,
+      "learning_rate": 3.1009907918518877e-07,
+      "loss": 0.5653,
+      "step": 3049
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3277835093048738,
+      "learning_rate": 3.019963474954923e-07,
+      "loss": 0.5925,
+      "step": 3050
+    },
+    {
+      "epoch": 0.97632,
+      "grad_norm": 0.36935821223351273,
+      "learning_rate": 2.9400072192800054e-07,
+      "loss": 0.6103,
+      "step": 3051
+    },
+    {
+      "epoch": 0.97664,
+      "grad_norm": 0.34596844325611154,
+      "learning_rate": 2.8611221107246455e-07,
+      "loss": 0.5707,
+      "step": 3052
+    },
+    {
+      "epoch": 0.97696,
+      "grad_norm": 0.34797252141198665,
+      "learning_rate": 2.7833082340353867e-07,
+      "loss": 0.5523,
+      "step": 3053
+    },
+    {
+      "epoch": 0.97728,
+      "grad_norm": 0.33594949599780155,
+      "learning_rate": 2.706565672808248e-07,
+      "loss": 0.5574,
+      "step": 3054
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3626534604748479,
+      "learning_rate": 2.630894509488058e-07,
+      "loss": 0.5947,
+      "step": 3055
+    },
+    {
+      "epoch": 0.97792,
+      "grad_norm": 0.3393564628713558,
+      "learning_rate": 2.55629482536901e-07,
+      "loss": 0.5682,
+      "step": 3056
+    },
+    {
+      "epoch": 0.97824,
+      "grad_norm": 0.3456214471911362,
+      "learning_rate": 2.482766700593664e-07,
+      "loss": 0.5519,
+      "step": 3057
+    },
+    {
+      "epoch": 0.97856,
+      "grad_norm": 0.41635533270319847,
+      "learning_rate": 2.4103102141539436e-07,
+      "loss": 0.5752,
+      "step": 3058
+    },
+    {
+      "epoch": 0.97888,
+      "grad_norm": 0.3445549388275251,
+      "learning_rate": 2.3389254438901386e-07,
+      "loss": 0.5574,
+      "step": 3059
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.33765553083777006,
+      "learning_rate": 2.2686124664912378e-07,
+      "loss": 0.6115,
+      "step": 3060
+    },
+    {
+      "epoch": 0.97952,
+      "grad_norm": 0.3424257364579159,
+      "learning_rate": 2.199371357495039e-07,
+      "loss": 0.5889,
+      "step": 3061
+    },
+    {
+      "epoch": 0.97984,
+      "grad_norm": 0.35676989267410675,
+      "learning_rate": 2.1312021912875956e-07,
+      "loss": 0.5679,
+      "step": 3062
+    },
+    {
+      "epoch": 0.98016,
+      "grad_norm": 0.3605283767238318,
+      "learning_rate": 2.064105041103326e-07,
+      "loss": 0.6012,
+      "step": 3063
+    },
+    {
+      "epoch": 0.98048,
+      "grad_norm": 0.3491057795652527,
+      "learning_rate": 1.9980799790251247e-07,
+      "loss": 0.5783,
+      "step": 3064
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4055076865265235,
+      "learning_rate": 1.9331270759840313e-07,
+      "loss": 0.5831,
+      "step": 3065
+    },
+    {
+      "epoch": 0.98112,
+      "grad_norm": 0.356913503502491,
+      "learning_rate": 1.8692464017594503e-07,
+      "loss": 0.6052,
+      "step": 3066
+    },
+    {
+      "epoch": 0.98144,
+      "grad_norm": 0.36645817945871173,
+      "learning_rate": 1.8064380249787073e-07,
+      "loss": 0.5955,
+      "step": 3067
+    },
+    {
+      "epoch": 0.98176,
+      "grad_norm": 0.33668101057489536,
+      "learning_rate": 1.744702013117161e-07,
+      "loss": 0.5877,
+      "step": 3068
+    },
+    {
+      "epoch": 0.98208,
+      "grad_norm": 0.33554858337596616,
+      "learning_rate": 1.6840384324980917e-07,
+      "loss": 0.5149,
+      "step": 3069
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3517379456116306,
+      "learning_rate": 1.6244473482929236e-07,
+      "loss": 0.5725,
+      "step": 3070
+    },
+    {
+      "epoch": 0.98272,
+      "grad_norm": 0.3542041241590634,
+      "learning_rate": 1.5659288245204462e-07,
+      "loss": 0.599,
+      "step": 3071
+    },
+    {
+      "epoch": 0.98304,
+      "grad_norm": 0.34010650493999023,
+      "learning_rate": 1.508482924047483e-07,
+      "loss": 0.545,
+      "step": 3072
+    },
+    {
+      "epoch": 0.98336,
+      "grad_norm": 0.34643566377072554,
+      "learning_rate": 1.452109708588667e-07,
+      "loss": 0.5968,
+      "step": 3073
+    },
+    {
+      "epoch": 0.98368,
+      "grad_norm": 0.35445576952004687,
+      "learning_rate": 1.3968092387057763e-07,
+      "loss": 0.5543,
+      "step": 3074
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.35937402681947833,
+      "learning_rate": 1.342581573808732e-07,
+      "loss": 0.5788,
+      "step": 3075
+    },
+    {
+      "epoch": 0.98432,
+      "grad_norm": 0.35937832592448776,
+      "learning_rate": 1.2894267721543784e-07,
+      "loss": 0.5749,
+      "step": 3076
+    },
+    {
+      "epoch": 0.98464,
+      "grad_norm": 0.3362818494664272,
+      "learning_rate": 1.2373448908473695e-07,
+      "loss": 0.5743,
+      "step": 3077
+    },
+    {
+      "epoch": 0.98496,
+      "grad_norm": 0.3718921725009722,
+      "learning_rate": 1.186335985839393e-07,
+      "loss": 0.5452,
+      "step": 3078
+    },
+    {
+      "epoch": 0.98528,
+      "grad_norm": 0.33374310850780264,
+      "learning_rate": 1.1364001119298362e-07,
+      "loss": 0.5157,
+      "step": 3079
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.32540387502014295,
+      "learning_rate": 1.0875373227647866e-07,
+      "loss": 0.5522,
+      "step": 3080
+    },
+    {
+      "epoch": 0.98592,
+      "grad_norm": 0.35199700312823773,
+      "learning_rate": 1.0397476708380315e-07,
+      "loss": 0.5559,
+      "step": 3081
+    },
+    {
+      "epoch": 0.98624,
+      "grad_norm": 0.328713406921307,
+      "learning_rate": 9.930312074902803e-08,
+      "loss": 0.5604,
+      "step": 3082
+    },
+    {
+      "epoch": 0.98656,
+      "grad_norm": 0.39051777192109,
+      "learning_rate": 9.473879829091648e-08,
+      "loss": 0.5711,
+      "step": 3083
+    },
+    {
+      "epoch": 0.98688,
+      "grad_norm": 0.3559927552619233,
+      "learning_rate": 9.02818046129461e-08,
+      "loss": 0.5886,
+      "step": 3084
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.35322712440811577,
+      "learning_rate": 8.59321445032979e-08,
+      "loss": 0.5606,
+      "step": 3085
+    },
+    {
+      "epoch": 0.98752,
+      "grad_norm": 0.3455829112476088,
+      "learning_rate": 8.168982263483394e-08,
+      "loss": 0.5584,
+      "step": 3086
+    },
+    {
+      "epoch": 0.98784,
+      "grad_norm": 0.3605758720130785,
+      "learning_rate": 7.755484356509746e-08,
+      "loss": 0.5659,
+      "step": 3087
+    },
+    {
+      "epoch": 0.98816,
+      "grad_norm": 0.36406820541827006,
+      "learning_rate": 7.352721173633504e-08,
+      "loss": 0.6191,
+      "step": 3088
+    },
+    {
+      "epoch": 0.98848,
+      "grad_norm": 0.337989920536971,
+      "learning_rate": 6.960693147542996e-08,
+      "loss": 0.5625,
+      "step": 3089
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.36405275356887806,
+      "learning_rate": 6.579400699397998e-08,
+      "loss": 0.5945,
+      "step": 3090
+    },
+    {
+      "epoch": 0.98912,
+      "grad_norm": 0.35175253541383983,
+      "learning_rate": 6.208844238823064e-08,
+      "loss": 0.534,
+      "step": 3091
+    },
+    {
+      "epoch": 0.98944,
+      "grad_norm": 0.32270622427841283,
+      "learning_rate": 5.849024163908645e-08,
+      "loss": 0.5522,
+      "step": 3092
+    },
+    {
+      "epoch": 0.98976,
+      "grad_norm": 0.3267607854731609,
+      "learning_rate": 5.4999408612110834e-08,
+      "loss": 0.5065,
+      "step": 3093
+    },
+    {
+      "epoch": 0.99008,
+      "grad_norm": 0.333087602601193,
+      "learning_rate": 5.161594705753725e-08,
+      "loss": 0.5683,
+      "step": 3094
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.34012888786014306,
+      "learning_rate": 4.833986061022477e-08,
+      "loss": 0.5621,
+      "step": 3095
+    },
+    {
+      "epoch": 0.99072,
+      "grad_norm": 0.341411041628581,
+      "learning_rate": 4.517115278969142e-08,
+      "loss": 0.5392,
+      "step": 3096
+    },
+    {
+      "epoch": 0.99104,
+      "grad_norm": 0.31678780309177473,
+      "learning_rate": 4.210982700010302e-08,
+      "loss": 0.5672,
+      "step": 3097
+    },
+    {
+      "epoch": 0.99136,
+      "grad_norm": 0.32613069832826314,
+      "learning_rate": 3.915588653026214e-08,
+      "loss": 0.5624,
+      "step": 3098
+    },
+    {
+      "epoch": 0.99168,
+      "grad_norm": 0.3388788089473714,
+      "learning_rate": 3.6309334553596976e-08,
+      "loss": 0.5509,
+      "step": 3099
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.34094078498765784,
+      "learning_rate": 3.357017412817243e-08,
+      "loss": 0.6102,
+      "step": 3100
+    },
+    {
+      "epoch": 0.99232,
+      "grad_norm": 0.33001286343331304,
+      "learning_rate": 3.0938408196690136e-08,
+      "loss": 0.5729,
+      "step": 3101
+    },
+    {
+      "epoch": 0.99264,
+      "grad_norm": 0.33891470644383886,
+      "learning_rate": 2.841403958647737e-08,
+      "loss": 0.536,
+      "step": 3102
+    },
+    {
+      "epoch": 0.99296,
+      "grad_norm": 0.3534016909798925,
+      "learning_rate": 2.59970710094537e-08,
+      "loss": 0.6065,
+      "step": 3103
+    },
+    {
+      "epoch": 0.99328,
+      "grad_norm": 0.33991779310580084,
+      "learning_rate": 2.3687505062208738e-08,
+      "loss": 0.553,
+      "step": 3104
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3254908024625272,
+      "learning_rate": 2.1485344225902205e-08,
+      "loss": 0.532,
+      "step": 3105
+    },
+    {
+      "epoch": 0.99392,
+      "grad_norm": 0.34983334152372686,
+      "learning_rate": 1.9390590866341652e-08,
+      "loss": 0.5995,
+      "step": 3106
+    },
+    {
+      "epoch": 0.99424,
+      "grad_norm": 0.3537433105206389,
+      "learning_rate": 1.7403247233926945e-08,
+      "loss": 0.5993,
+      "step": 3107
+    },
+    {
+      "epoch": 0.99456,
+      "grad_norm": 0.34247688714640284,
+      "learning_rate": 1.5523315463672473e-08,
+      "loss": 0.556,
+      "step": 3108
+    },
+    {
+      "epoch": 0.99488,
+      "grad_norm": 0.36795006478868114,
+      "learning_rate": 1.375079757519604e-08,
+      "loss": 0.6431,
+      "step": 3109
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3617742501927704,
+      "learning_rate": 1.2085695472729975e-08,
+      "loss": 0.5835,
+      "step": 3110
+    },
+    {
+      "epoch": 0.99552,
+      "grad_norm": 0.35701364706114147,
+      "learning_rate": 1.0528010945098921e-08,
+      "loss": 0.6025,
+      "step": 3111
+    },
+    {
+      "epoch": 0.99584,
+      "grad_norm": 0.3742832929399474,
+      "learning_rate": 9.077745665730941e-09,
+      "loss": 0.604,
+      "step": 3112
+    },
+    {
+      "epoch": 0.99616,
+      "grad_norm": 0.36316691586090294,
+      "learning_rate": 7.734901192657517e-09,
+      "loss": 0.5374,
+      "step": 3113
+    },
+    {
+      "epoch": 0.99648,
+      "grad_norm": 0.3322737274559622,
+      "learning_rate": 6.499478968502448e-09,
+      "loss": 0.5394,
+      "step": 3114
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3594354091921863,
+      "learning_rate": 5.371480320481848e-09,
+      "loss": 0.5727,
+      "step": 3115
+    },
+    {
+      "epoch": 0.99712,
+      "grad_norm": 0.35871374527749056,
+      "learning_rate": 4.35090646041525e-09,
+      "loss": 0.5504,
+      "step": 3116
+    },
+    {
+      "epoch": 0.99744,
+      "grad_norm": 0.38832115662509037,
+      "learning_rate": 3.437758484714504e-09,
+      "loss": 0.5861,
+      "step": 3117
+    },
+    {
+      "epoch": 0.99776,
+      "grad_norm": 0.3561515423446857,
+      "learning_rate": 2.6320373743837777e-09,
+      "loss": 0.584,
+      "step": 3118
+    },
+    {
+      "epoch": 0.99808,
+      "grad_norm": 0.3605520935287755,
+      "learning_rate": 1.9337439949973502e-09,
+      "loss": 0.5807,
+      "step": 3119
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.433149334004452,
+      "learning_rate": 1.3428790967440208e-09,
+      "loss": 0.6034,
+      "step": 3120
+    },
+    {
+      "epoch": 0.99872,
+      "grad_norm": 0.39777144062066955,
+      "learning_rate": 8.594433143938041e-10,
+      "loss": 0.5864,
+      "step": 3121
+    },
+    {
+      "epoch": 0.99904,
+      "grad_norm": 0.509715795086994,
+      "learning_rate": 4.83437167309031e-10,
+      "loss": 0.5699,
+      "step": 3122
+    },
+    {
+      "epoch": 0.99936,
+      "grad_norm": 0.33267505866821645,
+      "learning_rate": 2.148610594221445e-10,
+      "loss": 0.5725,
+      "step": 3123
+    },
+    {
+      "epoch": 0.99968,
+      "grad_norm": 0.3562465475370115,
+      "learning_rate": 5.3715279280108777e-11,
+      "loss": 0.5751,
+      "step": 3124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3552379570399888,
+      "learning_rate": 0.0,
+      "loss": 0.5645,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 2770985822846976.0,
+      "train_loss": 0.6393652757930756,
+      "train_runtime": 50013.1059,
+      "train_samples_per_second": 1.0,
+      "train_steps_per_second": 0.062
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2770985822846976.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..932034178d574272add53b52f3aad9cfdef411d0
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..628d6aa4457929c0fb80c641ebfd0ced124e6bf3
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae95ba94a4f8f38e236e28a4ee58be7941b804709c60c205490bc088ba9943cb
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fd49991c44b4d87bd118b70f160b75978e43b6bb
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa722027eae02f64df4708c7015be75ac9446545d1383e5548676926c114f8f1
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..157b51da3e7c5591ff3a4625eb4527f3a1010a02
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.2798540748627416,
+      "learning_rate": 2e-05,
+      "loss": 1.5555,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.1130663775030645,
+      "learning_rate": 4e-05,
+      "loss": 1.5566,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.1165879934971559,
+      "learning_rate": 6e-05,
+      "loss": 1.4623,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.0058575486770314,
+      "learning_rate": 8e-05,
+      "loss": 1.3626,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.219011347258107,
+      "learning_rate": 0.0001,
+      "loss": 1.2277,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.375374057893773,
+      "learning_rate": 0.00012,
+      "loss": 1.0392,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9163969142049067,
+      "learning_rate": 0.00014,
+      "loss": 1.0107,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7529784242992968,
+      "learning_rate": 0.00016,
+      "loss": 0.9169,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6095581532563064,
+      "learning_rate": 0.00018,
+      "loss": 0.8641,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5462052358272893,
+      "learning_rate": 0.0002,
+      "loss": 0.8992,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5157666896476973,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9204,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5002919821711009,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8508,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5584305955660048,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.8616,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5118169294900369,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8525,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5037775467974861,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.8293,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5305323185219055,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.92,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.49521462897177754,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8508,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.8517689323701216,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8561,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.462010867053101,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8378,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.42475383178016946,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8345,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4318454627640549,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8076,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4356633406944935,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8393,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.42037201194141616,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8468,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.44369315732198766,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8319,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5254814555076129,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8612,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4744180205622037,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8749,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.46898004305474095,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.784,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4623791910998875,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.84,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4366697265960226,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.797,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.44423044794779787,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8258,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.42626300065856604,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8326,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4193465122697224,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8412,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.40681649254496394,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.7896,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.41612527711854297,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.778,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.39751435632214827,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8441,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4077296296795258,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8604,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4180398678714927,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8307,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.41438332644147524,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8027,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4134687870277017,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7947,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3843193571282607,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.7355,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.41552762355305467,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8159,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.41784625055790997,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.7917,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.38524871144404454,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.762,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.40557699664979796,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8135,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.40666034746303753,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8029,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.39051816926073535,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8264,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.37829129722754057,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7413,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4220307185970994,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8492,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.3778753770849601,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.7066,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.374888863553464,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.7368,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.39133935091911604,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.724,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.40002567971895214,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7878,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42178931968695155,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8216,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.40860442858917406,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.7551,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.39852413957814403,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7644,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4189572790433841,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8023,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.40855887030389637,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.7531,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3801608851366451,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7723,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4139301030131194,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.825,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4017543169947211,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8496,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.44378337867320256,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8018,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.41300498833124244,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.7794,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.40657492525865646,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7831,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.395465279461956,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7721,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.42703491839853985,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.7692,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.38479406258477233,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7739,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4047609139504125,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.7813,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4020033275712984,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7504,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3884606066242859,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7561,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3934207199977672,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7415,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3873179960204124,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.7912,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.37693458798587726,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7326,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4061678138034663,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.7822,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3892051655151776,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7584,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.38310012668403887,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7867,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3784062031016005,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7316,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3907279184041235,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7596,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4139029656655114,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8192,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4157507841723872,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7086,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.39936633784499215,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7374,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4238196965131133,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.7223,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3767284979400297,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7621,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.40780204061722725,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7928,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3948337750910549,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7786,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4082750981486944,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7655,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3817389147753865,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7724,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4008774240859956,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7723,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3788953655261612,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7652,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.425366550485755,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7428,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.44329184148884065,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7772,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.46025119810711906,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7935,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.37737275103783063,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.6967,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.42706714197880036,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7906,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.45563949250899594,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7696,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3776185851415028,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7572,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.36984631782078564,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.705,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4346876784878423,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7416,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.36934331041539875,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7529,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4081247719533251,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7242,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.40855001328336676,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7346,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3864667377219663,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7443,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.41280954364426664,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7136,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3634883141450418,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7121,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.395695601758825,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7455,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3643597037537018,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7331,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.38169101775305364,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.783,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.37909138793115676,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7507,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3687030856036655,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.784,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.38865913887000614,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7235,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.38487452507771075,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7379,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3890186367456905,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7081,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.39434783916904126,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7741,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4536991410557638,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7883,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3874086143124245,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7248,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4140420014704873,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7656,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.37306782116254605,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7952,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3707950552859267,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7108,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.38928915678585724,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7361,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3608998439607155,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.683,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.39931417035035704,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7405,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3587369475012143,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.706,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3750960748163733,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.754,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.36451783245977326,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7736,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.38677483486032344,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7508,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3744920171166124,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7312,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3644901776487304,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7111,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3661180746712227,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.6941,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3850909914727337,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.6558,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.38408294553109434,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7566,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3798445105114034,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7327,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.37477045736590797,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7466,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3602184650700641,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7361,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.37286154538507166,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7151,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.36266055772384015,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7254,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.37012911200493753,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7276,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3611107229784553,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7159,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.36198737541944453,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.6928,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.36055521300792104,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.6529,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.36079112047603207,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7311,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.38222230627087117,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7788,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3583196360529785,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7411,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3693156445504609,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7223,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5253433057283227,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7243,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.39667834455785345,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7103,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3609170146865518,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.6811,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.36480685243748595,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7154,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.34362916664265325,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7292,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3831351941947736,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.772,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.37678900834449036,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7076,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.35821903152379814,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7138,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.36335736308223704,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7016,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.37001387570803446,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7365,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.34053687246539766,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7046,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3480581382643813,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7043,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3622932003356663,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7204,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3638401170001878,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.6955,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.35897583152772905,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7112,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.36592350625047265,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7081,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.35759204884810725,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.737,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3629868550931301,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7238,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.37705761812761196,
+      "learning_rate": 0.0001,
+      "loss": 0.7096,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.36707266251522613,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.6896,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.41185970441538455,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7107,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.35197588060842433,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6446,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.38227151419378474,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.6942,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.370823957452414,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.684,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3626666194137186,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.6576,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.39717099148907603,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.6711,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3547429214150075,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.6925,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.35141948293081565,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.728,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3807425547952564,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7231,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.370750070649178,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7071,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.36587979218630007,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.6576,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3756124782863467,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7395,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3454848482350315,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7006,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3797633014006375,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7259,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3707990015793451,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.6494,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3359321829538724,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.6849,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4042981638968137,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.681,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3350977200257071,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.6663,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3935415912212897,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7659,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.36189997876991004,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.6555,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.35448506517236716,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7298,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.35106530961144455,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.6677,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.37542595427026854,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.6948,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.36209283497638994,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.6903,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3623486443522771,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7069,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3531878449805604,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7003,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.36007187478065084,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7155,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3526918728137452,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6676,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3505800871386953,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.6715,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3554299911235001,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6703,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3627855610891972,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.6662,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3642488950050494,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7045,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3451846156452362,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.6755,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.35162369090454626,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7042,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.33039628040857877,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.6818,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.36874697217925245,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.729,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.35504449153403855,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.736,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3505320135825363,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7133,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3340617641467672,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.6806,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3578847181945257,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.6431,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3489695240441819,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.6833,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3506625777470091,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7078,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3474445070349821,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.6889,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.32920237707492495,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.6646,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3334480843491693,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.6501,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3505039143876203,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7206,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3222047814682131,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.6638,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4160786394442177,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7626,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.34935480168303545,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.6544,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3690962579344299,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7605,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.34372036007065915,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.706,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.35913661977043404,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.6788,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3726112942467802,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7114,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.33458843319227594,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.6373,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.34431466922844606,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6884,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3478733203429474,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.6962,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.33823956689505236,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.6658,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.35445169110428304,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7168,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3659639754863401,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6614,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.358233978795873,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.6772,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.35303470161619455,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6174,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3370222333384653,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.6602,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.36422659550593695,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7251,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3834308718279013,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.6898,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3644135006275037,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6253,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3370974447420048,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6417,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.34431625767746926,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.678,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.34551979978352365,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.6871,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3545045862376799,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.6453,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.32066403890900497,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.6707,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.36734608391442103,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.6923,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3547951302678682,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.6761,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5267474819572611,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.6813,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.350204219412763,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7204,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.346890632229907,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6814,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3865269459158674,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6655,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3564641400714697,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.6694,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.34198572845873776,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7187,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3558838016655507,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6815,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3284662213794393,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6463,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.38476695777550485,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.7077,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.354596469884446,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7029,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3317849510157482,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.6491,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.36083308267005093,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6873,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3409002629967509,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6948,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3427746040952621,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6543,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3410742412711408,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6644,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.34794942732730494,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6971,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3577136892277511,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.6797,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.38624710187607797,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.6671,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3486373264972873,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7327,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3387786699029933,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6671,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.36571687228097544,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6798,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3588033229242822,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7403,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.336709736386362,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6639,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.391641085443981,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.6474,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.34919556737995416,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.6833,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.34360019996565294,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.6711,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.33826189609957946,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6549,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.33743588149216924,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.658,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.38110422068372724,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6688,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3538995730098079,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7222,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3318938752544763,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6636,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3447082958846042,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.636,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.35066462511198726,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.6468,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.37196499706566954,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.6735,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3527260375754371,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.6666,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3513942177687219,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6958,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3217678276961582,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.6256,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.32770407204810276,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6491,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.32765559383846565,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.6773,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3794195535065363,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.6887,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.37106110605162884,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.6741,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.35683941894435456,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6891,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.35203533584308394,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6577,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.337996015194583,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6332,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.35983244647991486,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7085,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.36214840887826166,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6921,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3585103763875682,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7095,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.34719063880984175,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3950007422079345,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6532,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3588343725626154,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.68,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.35916560967802036,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7225,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3532074162031371,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6726,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3684089516670794,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6947,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3505598227413042,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6524,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3480286691290391,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.699,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3391814941929011,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.6357,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3335239287966289,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.6366,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3385790402860968,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6937,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3657441278236026,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6638,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.36281028704902224,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6588,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3436454356363298,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.6811,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3618953503973711,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7065,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.35426892334235555,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6672,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.34139279273687584,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6287,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3760086821701436,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6999,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3508003120606531,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.6651,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.35184452760607765,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6346,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.38870947092895375,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.6777,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.330375968306909,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.6571,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.33472574510023134,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6433,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.33577366769732253,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.6649,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.39028037485639394,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.657,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3552212376429523,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.6521,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3481211606712931,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6883,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.39425813369040086,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.762,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.320909110748154,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6094,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3604948954854283,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6643,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.336643150141937,
+      "learning_rate": 0.0,
+      "loss": 0.7235,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 276143348056064.0,
+      "train_loss": 0.7382912295751083,
+      "train_runtime": 4947.3523,
+      "train_samples_per_second": 1.011,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 276143348056064.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..530a8d474ec8062e65791555dd78697f3fcb36b2
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e978324e8a0d889186468958049ed44d7862acea
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:372f73f96faf018a759b81a84a2a496fead382c96067b1a3e929827d989900dc
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b0fcbccdbaca7843a841ca9ed8c3f221aa639ac3
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6eedbab52c520cde50cc465fb01eb8045c76bae390d610723aace3f74fe1a71e
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a622ce67291f4181cb2dd5b8c0e769b779173e4b
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,26292 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3750,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0002666666666666667,
+      "grad_norm": 1.264853200757946,
+      "learning_rate": 1.7699115044247788e-06,
+      "loss": 1.5974,
+      "step": 1
+    },
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 1.217457875450062,
+      "learning_rate": 3.5398230088495575e-06,
+      "loss": 1.4732,
+      "step": 2
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.259966139048936,
+      "learning_rate": 5.3097345132743365e-06,
+      "loss": 1.5691,
+      "step": 3
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.2505240398961883,
+      "learning_rate": 7.079646017699115e-06,
+      "loss": 1.5631,
+      "step": 4
+    },
+    {
+      "epoch": 0.0013333333333333333,
+      "grad_norm": 1.1873132467629952,
+      "learning_rate": 8.849557522123894e-06,
+      "loss": 1.5566,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.083024669118219,
+      "learning_rate": 1.0619469026548673e-05,
+      "loss": 1.514,
+      "step": 6
+    },
+    {
+      "epoch": 0.0018666666666666666,
+      "grad_norm": 0.9670086918795727,
+      "learning_rate": 1.2389380530973452e-05,
+      "loss": 1.496,
+      "step": 7
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 1.034225839852347,
+      "learning_rate": 1.415929203539823e-05,
+      "loss": 1.4379,
+      "step": 8
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.9507313498380748,
+      "learning_rate": 1.592920353982301e-05,
+      "loss": 1.3717,
+      "step": 9
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.983287200714607,
+      "learning_rate": 1.7699115044247787e-05,
+      "loss": 1.3216,
+      "step": 10
+    },
+    {
+      "epoch": 0.0029333333333333334,
+      "grad_norm": 0.9080214189446323,
+      "learning_rate": 1.946902654867257e-05,
+      "loss": 1.2906,
+      "step": 11
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.999121539250498,
+      "learning_rate": 2.1238938053097346e-05,
+      "loss": 1.1628,
+      "step": 12
+    },
+    {
+      "epoch": 0.0034666666666666665,
+      "grad_norm": 0.9694957107789066,
+      "learning_rate": 2.3008849557522124e-05,
+      "loss": 1.2058,
+      "step": 13
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 1.2113099834106,
+      "learning_rate": 2.4778761061946905e-05,
+      "loss": 1.0821,
+      "step": 14
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8054362701984041,
+      "learning_rate": 2.6548672566371686e-05,
+      "loss": 1.0796,
+      "step": 15
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.8062720230664518,
+      "learning_rate": 2.831858407079646e-05,
+      "loss": 1.0231,
+      "step": 16
+    },
+    {
+      "epoch": 0.004533333333333334,
+      "grad_norm": 0.8626468497912609,
+      "learning_rate": 3.008849557522124e-05,
+      "loss": 0.9609,
+      "step": 17
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7948400258968932,
+      "learning_rate": 3.185840707964602e-05,
+      "loss": 1.0358,
+      "step": 18
+    },
+    {
+      "epoch": 0.005066666666666666,
+      "grad_norm": 0.8202331658253242,
+      "learning_rate": 3.3628318584070804e-05,
+      "loss": 0.9203,
+      "step": 19
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.70349059381211,
+      "learning_rate": 3.5398230088495574e-05,
+      "loss": 0.9488,
+      "step": 20
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.6245560185622042,
+      "learning_rate": 3.716814159292036e-05,
+      "loss": 0.9239,
+      "step": 21
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.7032002385338322,
+      "learning_rate": 3.893805309734514e-05,
+      "loss": 0.9343,
+      "step": 22
+    },
+    {
+      "epoch": 0.0061333333333333335,
+      "grad_norm": 0.6138853340560978,
+      "learning_rate": 4.0707964601769914e-05,
+      "loss": 0.8944,
+      "step": 23
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6252956302867735,
+      "learning_rate": 4.247787610619469e-05,
+      "loss": 0.9142,
+      "step": 24
+    },
+    {
+      "epoch": 0.006666666666666667,
+      "grad_norm": 0.5689669502910203,
+      "learning_rate": 4.4247787610619477e-05,
+      "loss": 0.8804,
+      "step": 25
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.6288100311989897,
+      "learning_rate": 4.601769911504425e-05,
+      "loss": 0.8975,
+      "step": 26
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.5525965844770411,
+      "learning_rate": 4.778761061946903e-05,
+      "loss": 0.8937,
+      "step": 27
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.5065552530181454,
+      "learning_rate": 4.955752212389381e-05,
+      "loss": 0.9136,
+      "step": 28
+    },
+    {
+      "epoch": 0.007733333333333333,
+      "grad_norm": 0.5201018248818462,
+      "learning_rate": 5.132743362831859e-05,
+      "loss": 0.8997,
+      "step": 29
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.5524579564998915,
+      "learning_rate": 5.309734513274337e-05,
+      "loss": 0.8422,
+      "step": 30
+    },
+    {
+      "epoch": 0.008266666666666667,
+      "grad_norm": 0.5303079983520723,
+      "learning_rate": 5.486725663716814e-05,
+      "loss": 0.8439,
+      "step": 31
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.5348941333565699,
+      "learning_rate": 5.663716814159292e-05,
+      "loss": 0.9265,
+      "step": 32
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.48407338678075934,
+      "learning_rate": 5.8407079646017705e-05,
+      "loss": 0.8567,
+      "step": 33
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.4930009016915646,
+      "learning_rate": 6.017699115044248e-05,
+      "loss": 0.8832,
+      "step": 34
+    },
+    {
+      "epoch": 0.009333333333333334,
+      "grad_norm": 0.5059503896009777,
+      "learning_rate": 6.194690265486725e-05,
+      "loss": 0.8384,
+      "step": 35
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.4968383701366383,
+      "learning_rate": 6.371681415929204e-05,
+      "loss": 0.8314,
+      "step": 36
+    },
+    {
+      "epoch": 0.009866666666666666,
+      "grad_norm": 0.5226825692592999,
+      "learning_rate": 6.548672566371682e-05,
+      "loss": 0.8601,
+      "step": 37
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.5045947528163728,
+      "learning_rate": 6.725663716814161e-05,
+      "loss": 0.8369,
+      "step": 38
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.5293030836072474,
+      "learning_rate": 6.902654867256638e-05,
+      "loss": 0.7889,
+      "step": 39
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.47708776202713943,
+      "learning_rate": 7.079646017699115e-05,
+      "loss": 0.8008,
+      "step": 40
+    },
+    {
+      "epoch": 0.010933333333333333,
+      "grad_norm": 0.5686752073899266,
+      "learning_rate": 7.256637168141593e-05,
+      "loss": 0.8019,
+      "step": 41
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.544125533975138,
+      "learning_rate": 7.433628318584072e-05,
+      "loss": 0.8626,
+      "step": 42
+    },
+    {
+      "epoch": 0.011466666666666667,
+      "grad_norm": 0.4898347653326332,
+      "learning_rate": 7.610619469026549e-05,
+      "loss": 0.8613,
+      "step": 43
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.45839478731360894,
+      "learning_rate": 7.787610619469027e-05,
+      "loss": 0.7437,
+      "step": 44
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.4901458596712785,
+      "learning_rate": 7.964601769911504e-05,
+      "loss": 0.7739,
+      "step": 45
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.48317563597167756,
+      "learning_rate": 8.141592920353983e-05,
+      "loss": 0.8318,
+      "step": 46
+    },
+    {
+      "epoch": 0.012533333333333334,
+      "grad_norm": 0.4886339551554164,
+      "learning_rate": 8.31858407079646e-05,
+      "loss": 0.8458,
+      "step": 47
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.46895080440104325,
+      "learning_rate": 8.495575221238938e-05,
+      "loss": 0.8192,
+      "step": 48
+    },
+    {
+      "epoch": 0.013066666666666667,
+      "grad_norm": 0.45837188999051376,
+      "learning_rate": 8.672566371681417e-05,
+      "loss": 0.7587,
+      "step": 49
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.5384787634584273,
+      "learning_rate": 8.849557522123895e-05,
+      "loss": 0.8482,
+      "step": 50
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6380236500653841,
+      "learning_rate": 9.026548672566371e-05,
+      "loss": 0.8591,
+      "step": 51
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.4719389003968988,
+      "learning_rate": 9.20353982300885e-05,
+      "loss": 0.7666,
+      "step": 52
+    },
+    {
+      "epoch": 0.014133333333333333,
+      "grad_norm": 0.44310129818703503,
+      "learning_rate": 9.380530973451328e-05,
+      "loss": 0.8256,
+      "step": 53
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.44888747088563763,
+      "learning_rate": 9.557522123893806e-05,
+      "loss": 0.7907,
+      "step": 54
+    },
+    {
+      "epoch": 0.014666666666666666,
+      "grad_norm": 0.44470853697317947,
+      "learning_rate": 9.734513274336283e-05,
+      "loss": 0.7779,
+      "step": 55
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.4553984167393368,
+      "learning_rate": 9.911504424778762e-05,
+      "loss": 0.764,
+      "step": 56
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.4337441525847164,
+      "learning_rate": 0.00010088495575221239,
+      "loss": 0.8364,
+      "step": 57
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.4725986162131283,
+      "learning_rate": 0.00010265486725663717,
+      "loss": 0.846,
+      "step": 58
+    },
+    {
+      "epoch": 0.015733333333333332,
+      "grad_norm": 0.46683580635871025,
+      "learning_rate": 0.00010442477876106196,
+      "loss": 0.7912,
+      "step": 59
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.4606875357871431,
+      "learning_rate": 0.00010619469026548674,
+      "loss": 0.8111,
+      "step": 60
+    },
+    {
+      "epoch": 0.016266666666666665,
+      "grad_norm": 0.4588149809457491,
+      "learning_rate": 0.0001079646017699115,
+      "loss": 0.7577,
+      "step": 61
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.4319312435830963,
+      "learning_rate": 0.00010973451327433629,
+      "loss": 0.7824,
+      "step": 62
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.4865410084194108,
+      "learning_rate": 0.00011150442477876106,
+      "loss": 0.731,
+      "step": 63
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.47980579123461414,
+      "learning_rate": 0.00011327433628318584,
+      "loss": 0.807,
+      "step": 64
+    },
+    {
+      "epoch": 0.017333333333333333,
+      "grad_norm": 0.4482407983036267,
+      "learning_rate": 0.00011504424778761063,
+      "loss": 0.7851,
+      "step": 65
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.4728595754373952,
+      "learning_rate": 0.00011681415929203541,
+      "loss": 0.7581,
+      "step": 66
+    },
+    {
+      "epoch": 0.017866666666666666,
+      "grad_norm": 0.4634381144472228,
+      "learning_rate": 0.0001185840707964602,
+      "loss": 0.7178,
+      "step": 67
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.44357198821834165,
+      "learning_rate": 0.00012035398230088497,
+      "loss": 0.8162,
+      "step": 68
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.4350869015181407,
+      "learning_rate": 0.00012212389380530974,
+      "loss": 0.6976,
+      "step": 69
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.4468609647095294,
+      "learning_rate": 0.0001238938053097345,
+      "loss": 0.8209,
+      "step": 70
+    },
+    {
+      "epoch": 0.018933333333333333,
+      "grad_norm": 0.46082477622959217,
+      "learning_rate": 0.0001256637168141593,
+      "loss": 0.7785,
+      "step": 71
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4617330853753975,
+      "learning_rate": 0.00012743362831858408,
+      "loss": 0.8422,
+      "step": 72
+    },
+    {
+      "epoch": 0.019466666666666667,
+      "grad_norm": 0.4454983705630943,
+      "learning_rate": 0.00012920353982300885,
+      "loss": 0.6729,
+      "step": 73
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.44214950569023675,
+      "learning_rate": 0.00013097345132743365,
+      "loss": 0.8234,
+      "step": 74
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.4324575762207759,
+      "learning_rate": 0.00013274336283185842,
+      "loss": 0.7741,
+      "step": 75
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.43153424291430115,
+      "learning_rate": 0.00013451327433628321,
+      "loss": 0.764,
+      "step": 76
+    },
+    {
+      "epoch": 0.020533333333333334,
+      "grad_norm": 0.45034174286548495,
+      "learning_rate": 0.00013628318584070796,
+      "loss": 0.8018,
+      "step": 77
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.4464786137625371,
+      "learning_rate": 0.00013805309734513276,
+      "loss": 0.7607,
+      "step": 78
+    },
+    {
+      "epoch": 0.021066666666666668,
+      "grad_norm": 0.4590769909090508,
+      "learning_rate": 0.00013982300884955753,
+      "loss": 0.8108,
+      "step": 79
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.4463713947630006,
+      "learning_rate": 0.0001415929203539823,
+      "loss": 0.7346,
+      "step": 80
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.41533645822148124,
+      "learning_rate": 0.0001433628318584071,
+      "loss": 0.7438,
+      "step": 81
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.4740827050643028,
+      "learning_rate": 0.00014513274336283187,
+      "loss": 0.7964,
+      "step": 82
+    },
+    {
+      "epoch": 0.022133333333333335,
+      "grad_norm": 0.44486736334646143,
+      "learning_rate": 0.00014690265486725664,
+      "loss": 0.7707,
+      "step": 83
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4302755386054654,
+      "learning_rate": 0.00014867256637168144,
+      "loss": 0.7948,
+      "step": 84
+    },
+    {
+      "epoch": 0.02266666666666667,
+      "grad_norm": 0.4559094561783215,
+      "learning_rate": 0.00015044247787610618,
+      "loss": 0.8243,
+      "step": 85
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.4900569264486964,
+      "learning_rate": 0.00015221238938053098,
+      "loss": 0.755,
+      "step": 86
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.4411264029207737,
+      "learning_rate": 0.00015398230088495575,
+      "loss": 0.7284,
+      "step": 87
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.4380314589116113,
+      "learning_rate": 0.00015575221238938055,
+      "loss": 0.758,
+      "step": 88
+    },
+    {
+      "epoch": 0.023733333333333332,
+      "grad_norm": 0.4464592706350816,
+      "learning_rate": 0.00015752212389380532,
+      "loss": 0.7402,
+      "step": 89
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4740249465879794,
+      "learning_rate": 0.0001592920353982301,
+      "loss": 0.7877,
+      "step": 90
+    },
+    {
+      "epoch": 0.024266666666666666,
+      "grad_norm": 0.4454949189834306,
+      "learning_rate": 0.0001610619469026549,
+      "loss": 0.7191,
+      "step": 91
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.5050326165623321,
+      "learning_rate": 0.00016283185840707966,
+      "loss": 0.826,
+      "step": 92
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.44453758782037683,
+      "learning_rate": 0.00016460176991150443,
+      "loss": 0.7437,
+      "step": 93
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.4336087295604444,
+      "learning_rate": 0.0001663716814159292,
+      "loss": 0.7969,
+      "step": 94
+    },
+    {
+      "epoch": 0.025333333333333333,
+      "grad_norm": 0.4308296907448977,
+      "learning_rate": 0.000168141592920354,
+      "loss": 0.7346,
+      "step": 95
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4477684697781309,
+      "learning_rate": 0.00016991150442477877,
+      "loss": 0.7826,
+      "step": 96
+    },
+    {
+      "epoch": 0.025866666666666666,
+      "grad_norm": 0.43116255885853144,
+      "learning_rate": 0.00017168141592920354,
+      "loss": 0.7633,
+      "step": 97
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.45304995086487815,
+      "learning_rate": 0.00017345132743362834,
+      "loss": 0.7727,
+      "step": 98
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.4082081367581844,
+      "learning_rate": 0.0001752212389380531,
+      "loss": 0.741,
+      "step": 99
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.4286344877289435,
+      "learning_rate": 0.0001769911504424779,
+      "loss": 0.7689,
+      "step": 100
+    },
+    {
+      "epoch": 0.026933333333333333,
+      "grad_norm": 0.432430752273439,
+      "learning_rate": 0.00017876106194690265,
+      "loss": 0.8096,
+      "step": 101
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.46785258504213295,
+      "learning_rate": 0.00018053097345132742,
+      "loss": 0.7556,
+      "step": 102
+    },
+    {
+      "epoch": 0.027466666666666667,
+      "grad_norm": 0.4609078171732939,
+      "learning_rate": 0.00018230088495575222,
+      "loss": 0.8028,
+      "step": 103
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.4120994332670511,
+      "learning_rate": 0.000184070796460177,
+      "loss": 0.7932,
+      "step": 104
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.44499469315040363,
+      "learning_rate": 0.0001858407079646018,
+      "loss": 0.7785,
+      "step": 105
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.46531373423169303,
+      "learning_rate": 0.00018761061946902656,
+      "loss": 0.7908,
+      "step": 106
+    },
+    {
+      "epoch": 0.028533333333333334,
+      "grad_norm": 0.4252555869358924,
+      "learning_rate": 0.00018938053097345133,
+      "loss": 0.7737,
+      "step": 107
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4405067233148738,
+      "learning_rate": 0.00019115044247787613,
+      "loss": 0.7763,
+      "step": 108
+    },
+    {
+      "epoch": 0.029066666666666668,
+      "grad_norm": 0.4079319254410207,
+      "learning_rate": 0.00019292035398230087,
+      "loss": 0.7709,
+      "step": 109
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.4184085086369884,
+      "learning_rate": 0.00019469026548672567,
+      "loss": 0.7483,
+      "step": 110
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.45950580294816856,
+      "learning_rate": 0.00019646017699115044,
+      "loss": 0.772,
+      "step": 111
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.44509191760629474,
+      "learning_rate": 0.00019823008849557524,
+      "loss": 0.7477,
+      "step": 112
+    },
+    {
+      "epoch": 0.030133333333333335,
+      "grad_norm": 0.4295996816553574,
+      "learning_rate": 0.0002,
+      "loss": 0.7339,
+      "step": 113
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4364555498847836,
+      "learning_rate": 0.00019999996269361907,
+      "loss": 0.7499,
+      "step": 114
+    },
+    {
+      "epoch": 0.030666666666666665,
+      "grad_norm": 0.45973042163769945,
+      "learning_rate": 0.00019999985077450406,
+      "loss": 0.7509,
+      "step": 115
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.45832747766303494,
+      "learning_rate": 0.0001999996642427385,
+      "loss": 0.764,
+      "step": 116
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.4190613732773172,
+      "learning_rate": 0.00019999940309846159,
+      "loss": 0.7682,
+      "step": 117
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.43455127814284006,
+      "learning_rate": 0.00019999906734186813,
+      "loss": 0.7423,
+      "step": 118
+    },
+    {
+      "epoch": 0.031733333333333336,
+      "grad_norm": 0.39579327955490035,
+      "learning_rate": 0.00019999865697320867,
+      "loss": 0.7257,
+      "step": 119
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.44288727123545557,
+      "learning_rate": 0.0001999981719927894,
+      "loss": 0.7668,
+      "step": 120
+    },
+    {
+      "epoch": 0.032266666666666666,
+      "grad_norm": 0.44031845865834995,
+      "learning_rate": 0.00019999761240097215,
+      "loss": 0.8193,
+      "step": 121
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.42671499637114035,
+      "learning_rate": 0.00019999697819817448,
+      "loss": 0.7412,
+      "step": 122
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.43688245149224186,
+      "learning_rate": 0.00019999626938486956,
+      "loss": 0.7553,
+      "step": 123
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.4081876652320984,
+      "learning_rate": 0.00019999548596158625,
+      "loss": 0.7363,
+      "step": 124
+    },
+    {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 0.3949348426439058,
+      "learning_rate": 0.00019999462792890912,
+      "loss": 0.7375,
+      "step": 125
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.436949348455344,
+      "learning_rate": 0.00019999369528747834,
+      "loss": 0.7542,
+      "step": 126
+    },
+    {
+      "epoch": 0.03386666666666667,
+      "grad_norm": 0.44127480376228306,
+      "learning_rate": 0.00019999268803798977,
+      "loss": 0.8077,
+      "step": 127
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.4278322129090262,
+      "learning_rate": 0.000199991606181195,
+      "loss": 0.7472,
+      "step": 128
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.42654970123056357,
+      "learning_rate": 0.0001999904497179012,
+      "loss": 0.7304,
+      "step": 129
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.49695312757498045,
+      "learning_rate": 0.00019998921864897121,
+      "loss": 0.7919,
+      "step": 130
+    },
+    {
+      "epoch": 0.03493333333333333,
+      "grad_norm": 0.42374704515274303,
+      "learning_rate": 0.00019998791297532362,
+      "loss": 0.719,
+      "step": 131
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.43558716586278284,
+      "learning_rate": 0.00019998653269793256,
+      "loss": 0.746,
+      "step": 132
+    },
+    {
+      "epoch": 0.03546666666666667,
+      "grad_norm": 0.40823667620717713,
+      "learning_rate": 0.000199985077817828,
+      "loss": 0.7034,
+      "step": 133
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.44248614365245026,
+      "learning_rate": 0.00019998354833609537,
+      "loss": 0.8261,
+      "step": 134
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.4615866263452828,
+      "learning_rate": 0.00019998194425387586,
+      "loss": 0.7399,
+      "step": 135
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.43962461064283653,
+      "learning_rate": 0.00019998026557236636,
+      "loss": 0.7973,
+      "step": 136
+    },
+    {
+      "epoch": 0.036533333333333334,
+      "grad_norm": 0.41934926245557447,
+      "learning_rate": 0.0001999785122928194,
+      "loss": 0.7024,
+      "step": 137
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4194356667697614,
+      "learning_rate": 0.00019997668441654312,
+      "loss": 0.7685,
+      "step": 138
+    },
+    {
+      "epoch": 0.037066666666666664,
+      "grad_norm": 0.4182590192400099,
+      "learning_rate": 0.00019997478194490133,
+      "loss": 0.7646,
+      "step": 139
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.42231129891707286,
+      "learning_rate": 0.00019997280487931356,
+      "loss": 0.7258,
+      "step": 140
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.44714167228108054,
+      "learning_rate": 0.00019997075322125492,
+      "loss": 0.7829,
+      "step": 141
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.40359445738189087,
+      "learning_rate": 0.00019996862697225622,
+      "loss": 0.7325,
+      "step": 142
+    },
+    {
+      "epoch": 0.03813333333333333,
+      "grad_norm": 0.4064228325069067,
+      "learning_rate": 0.0001999664261339039,
+      "loss": 0.727,
+      "step": 143
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4184207142338632,
+      "learning_rate": 0.00019996415070784007,
+      "loss": 0.7871,
+      "step": 144
+    },
+    {
+      "epoch": 0.03866666666666667,
+      "grad_norm": 0.40278141773965426,
+      "learning_rate": 0.0001999618006957625,
+      "loss": 0.7154,
+      "step": 145
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.4218347867198309,
+      "learning_rate": 0.00019995937609942462,
+      "loss": 0.7294,
+      "step": 146
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.4288114460153231,
+      "learning_rate": 0.00019995687692063544,
+      "loss": 0.7619,
+      "step": 147
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.4362730719846335,
+      "learning_rate": 0.0001999543031612597,
+      "loss": 0.805,
+      "step": 148
+    },
+    {
+      "epoch": 0.039733333333333336,
+      "grad_norm": 0.46119959525223236,
+      "learning_rate": 0.00019995165482321773,
+      "loss": 0.7192,
+      "step": 149
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4255994275960042,
+      "learning_rate": 0.00019994893190848555,
+      "loss": 0.6974,
+      "step": 150
+    },
+    {
+      "epoch": 0.040266666666666666,
+      "grad_norm": 0.39591345422504165,
+      "learning_rate": 0.00019994613441909483,
+      "loss": 0.7242,
+      "step": 151
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.45645688244270527,
+      "learning_rate": 0.00019994326235713277,
+      "loss": 0.7561,
+      "step": 152
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.4238047747089797,
+      "learning_rate": 0.00019994031572474237,
+      "loss": 0.75,
+      "step": 153
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.4163029492160629,
+      "learning_rate": 0.00019993729452412213,
+      "loss": 0.7097,
+      "step": 154
+    },
+    {
+      "epoch": 0.04133333333333333,
+      "grad_norm": 0.4412984717208657,
+      "learning_rate": 0.00019993419875752633,
+      "loss": 0.7913,
+      "step": 155
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.44306476010633583,
+      "learning_rate": 0.00019993102842726473,
+      "loss": 0.7528,
+      "step": 156
+    },
+    {
+      "epoch": 0.04186666666666667,
+      "grad_norm": 0.3974985905615024,
+      "learning_rate": 0.00019992778353570282,
+      "loss": 0.8091,
+      "step": 157
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.4347226330417167,
+      "learning_rate": 0.00019992446408526176,
+      "loss": 0.7327,
+      "step": 158
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.41402234147777023,
+      "learning_rate": 0.0001999210700784182,
+      "loss": 0.7646,
+      "step": 159
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.40254167313476474,
+      "learning_rate": 0.00019991760151770457,
+      "loss": 0.7505,
+      "step": 160
+    },
+    {
+      "epoch": 0.04293333333333333,
+      "grad_norm": 0.4044806910369341,
+      "learning_rate": 0.00019991405840570886,
+      "loss": 0.6412,
+      "step": 161
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.43014238855369424,
+      "learning_rate": 0.0001999104407450746,
+      "loss": 0.7195,
+      "step": 162
+    },
+    {
+      "epoch": 0.04346666666666667,
+      "grad_norm": 0.42138360468041963,
+      "learning_rate": 0.00019990674853850111,
+      "loss": 0.7595,
+      "step": 163
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.42512241780112225,
+      "learning_rate": 0.00019990298178874322,
+      "loss": 0.7241,
+      "step": 164
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4329587059045856,
+      "learning_rate": 0.00019989914049861143,
+      "loss": 0.7581,
+      "step": 165
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.41165876976626564,
+      "learning_rate": 0.00019989522467097178,
+      "loss": 0.7899,
+      "step": 166
+    },
+    {
+      "epoch": 0.044533333333333334,
+      "grad_norm": 0.46486524170602445,
+      "learning_rate": 0.00019989123430874602,
+      "loss": 0.7256,
+      "step": 167
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3932375147112573,
+      "learning_rate": 0.0001998871694149114,
+      "loss": 0.7006,
+      "step": 168
+    },
+    {
+      "epoch": 0.045066666666666665,
+      "grad_norm": 0.4065406693697278,
+      "learning_rate": 0.00019988302999250098,
+      "loss": 0.7877,
+      "step": 169
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.41237994290530905,
+      "learning_rate": 0.0001998788160446032,
+      "loss": 0.7435,
+      "step": 170
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.41724585168439526,
+      "learning_rate": 0.0001998745275743622,
+      "loss": 0.7635,
+      "step": 171
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.39239738733273327,
+      "learning_rate": 0.00019987016458497778,
+      "loss": 0.7213,
+      "step": 172
+    },
+    {
+      "epoch": 0.04613333333333333,
+      "grad_norm": 0.4153912236878267,
+      "learning_rate": 0.00019986572707970525,
+      "loss": 0.6877,
+      "step": 173
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.40897316873652534,
+      "learning_rate": 0.00019986121506185555,
+      "loss": 0.765,
+      "step": 174
+    },
+    {
+      "epoch": 0.04666666666666667,
+      "grad_norm": 0.42636975728495263,
+      "learning_rate": 0.00019985662853479525,
+      "loss": 0.7222,
+      "step": 175
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.4087494019314695,
+      "learning_rate": 0.00019985196750194647,
+      "loss": 0.6956,
+      "step": 176
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.40632207244215324,
+      "learning_rate": 0.0001998472319667869,
+      "loss": 0.7523,
+      "step": 177
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4005990115694852,
+      "learning_rate": 0.00019984242193284995,
+      "loss": 0.7262,
+      "step": 178
+    },
+    {
+      "epoch": 0.047733333333333336,
+      "grad_norm": 0.40921455765118137,
+      "learning_rate": 0.00019983753740372443,
+      "loss": 0.7415,
+      "step": 179
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.39320300572180766,
+      "learning_rate": 0.00019983257838305485,
+      "loss": 0.7473,
+      "step": 180
+    },
+    {
+      "epoch": 0.048266666666666666,
+      "grad_norm": 0.46521592954335056,
+      "learning_rate": 0.00019982754487454126,
+      "loss": 0.7795,
+      "step": 181
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.420772938160873,
+      "learning_rate": 0.00019982243688193934,
+      "loss": 0.7604,
+      "step": 182
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.4159098112445086,
+      "learning_rate": 0.00019981725440906023,
+      "loss": 0.7592,
+      "step": 183
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.4142585530411165,
+      "learning_rate": 0.0001998119974597708,
+      "loss": 0.7427,
+      "step": 184
+    },
+    {
+      "epoch": 0.04933333333333333,
+      "grad_norm": 0.3965954403079215,
+      "learning_rate": 0.00019980666603799333,
+      "loss": 0.7122,
+      "step": 185
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.38253711216327274,
+      "learning_rate": 0.0001998012601477058,
+      "loss": 0.7303,
+      "step": 186
+    },
+    {
+      "epoch": 0.04986666666666666,
+      "grad_norm": 0.3819538197346378,
+      "learning_rate": 0.00019979577979294168,
+      "loss": 0.7364,
+      "step": 187
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.4102247800801515,
+      "learning_rate": 0.00019979022497779002,
+      "loss": 0.7478,
+      "step": 188
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.39816195365377816,
+      "learning_rate": 0.0001997845957063954,
+      "loss": 0.7482,
+      "step": 189
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.40065497357861385,
+      "learning_rate": 0.00019977889198295794,
+      "loss": 0.6791,
+      "step": 190
+    },
+    {
+      "epoch": 0.05093333333333333,
+      "grad_norm": 0.4185398481679138,
+      "learning_rate": 0.0001997731138117334,
+      "loss": 0.7104,
+      "step": 191
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.41532835516077843,
+      "learning_rate": 0.00019976726119703305,
+      "loss": 0.6745,
+      "step": 192
+    },
+    {
+      "epoch": 0.05146666666666667,
+      "grad_norm": 0.4849381478848775,
+      "learning_rate": 0.00019976133414322366,
+      "loss": 0.7565,
+      "step": 193
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.42890622187771793,
+      "learning_rate": 0.00019975533265472755,
+      "loss": 0.7601,
+      "step": 194
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.40645431891112266,
+      "learning_rate": 0.0001997492567360226,
+      "loss": 0.7471,
+      "step": 195
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.44050504688919007,
+      "learning_rate": 0.00019974310639164227,
+      "loss": 0.7172,
+      "step": 196
+    },
+    {
+      "epoch": 0.052533333333333335,
+      "grad_norm": 0.4175460735073087,
+      "learning_rate": 0.00019973688162617544,
+      "loss": 0.7852,
+      "step": 197
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.46387859545026644,
+      "learning_rate": 0.0001997305824442666,
+      "loss": 0.7344,
+      "step": 198
+    },
+    {
+      "epoch": 0.053066666666666665,
+      "grad_norm": 0.43950368771088844,
+      "learning_rate": 0.00019972420885061576,
+      "loss": 0.777,
+      "step": 199
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.47682984929324623,
+      "learning_rate": 0.00019971776084997842,
+      "loss": 0.6963,
+      "step": 200
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4210968151337064,
+      "learning_rate": 0.00019971123844716562,
+      "loss": 0.7078,
+      "step": 201
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.4426667897608301,
+      "learning_rate": 0.00019970464164704389,
+      "loss": 0.7714,
+      "step": 202
+    },
+    {
+      "epoch": 0.05413333333333333,
+      "grad_norm": 0.3943173565109175,
+      "learning_rate": 0.0001996979704545353,
+      "loss": 0.7884,
+      "step": 203
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4071080597282432,
+      "learning_rate": 0.0001996912248746174,
+      "loss": 0.7364,
+      "step": 204
+    },
+    {
+      "epoch": 0.05466666666666667,
+      "grad_norm": 0.39539875302690025,
+      "learning_rate": 0.00019968440491232326,
+      "loss": 0.7742,
+      "step": 205
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.427477962182699,
+      "learning_rate": 0.00019967751057274144,
+      "loss": 0.7617,
+      "step": 206
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.40297635868881115,
+      "learning_rate": 0.00019967054186101598,
+      "loss": 0.7412,
+      "step": 207
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.4072669433506653,
+      "learning_rate": 0.00019966349878234647,
+      "loss": 0.7117,
+      "step": 208
+    },
+    {
+      "epoch": 0.055733333333333336,
+      "grad_norm": 0.4236907433036573,
+      "learning_rate": 0.00019965638134198792,
+      "loss": 0.7667,
+      "step": 209
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.41684491796587203,
+      "learning_rate": 0.00019964918954525085,
+      "loss": 0.7525,
+      "step": 210
+    },
+    {
+      "epoch": 0.056266666666666666,
+      "grad_norm": 0.4196558925944855,
+      "learning_rate": 0.00019964192339750128,
+      "loss": 0.7627,
+      "step": 211
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.4138313180882757,
+      "learning_rate": 0.00019963458290416064,
+      "loss": 0.7377,
+      "step": 212
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.3791671739181995,
+      "learning_rate": 0.0001996271680707059,
+      "loss": 0.6732,
+      "step": 213
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.4049548923517151,
+      "learning_rate": 0.00019961967890266946,
+      "loss": 0.7683,
+      "step": 214
+    },
+    {
+      "epoch": 0.05733333333333333,
+      "grad_norm": 0.43085041346392816,
+      "learning_rate": 0.00019961211540563917,
+      "loss": 0.7627,
+      "step": 215
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.48306971944769167,
+      "learning_rate": 0.00019960447758525846,
+      "loss": 0.7485,
+      "step": 216
+    },
+    {
+      "epoch": 0.057866666666666663,
+      "grad_norm": 0.4167352089373417,
+      "learning_rate": 0.00019959676544722602,
+      "loss": 0.7579,
+      "step": 217
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.3854266730684992,
+      "learning_rate": 0.00019958897899729613,
+      "loss": 0.7072,
+      "step": 218
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4143261432972768,
+      "learning_rate": 0.0001995811182412785,
+      "loss": 0.7479,
+      "step": 219
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.39807525682944156,
+      "learning_rate": 0.00019957318318503822,
+      "loss": 0.7295,
+      "step": 220
+    },
+    {
+      "epoch": 0.05893333333333333,
+      "grad_norm": 0.42879920347124467,
+      "learning_rate": 0.00019956517383449583,
+      "loss": 0.7272,
+      "step": 221
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.3805555655205918,
+      "learning_rate": 0.00019955709019562742,
+      "loss": 0.7118,
+      "step": 222
+    },
+    {
+      "epoch": 0.05946666666666667,
+      "grad_norm": 0.4141547390911347,
+      "learning_rate": 0.0001995489322744643,
+      "loss": 0.7557,
+      "step": 223
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.4297484969084553,
+      "learning_rate": 0.0001995407000770934,
+      "loss": 0.7887,
+      "step": 224
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.4274267901647024,
+      "learning_rate": 0.00019953239360965695,
+      "loss": 0.7769,
+      "step": 225
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.38182301143404707,
+      "learning_rate": 0.00019952401287835268,
+      "loss": 0.6746,
+      "step": 226
+    },
+    {
+      "epoch": 0.060533333333333335,
+      "grad_norm": 0.40416336228997984,
+      "learning_rate": 0.00019951555788943364,
+      "loss": 0.7707,
+      "step": 227
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4088907253997118,
+      "learning_rate": 0.00019950702864920836,
+      "loss": 0.7583,
+      "step": 228
+    },
+    {
+      "epoch": 0.061066666666666665,
+      "grad_norm": 0.4010022725937896,
+      "learning_rate": 0.0001994984251640407,
+      "loss": 0.7512,
+      "step": 229
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.38961594471098804,
+      "learning_rate": 0.00019948974744035,
+      "loss": 0.7066,
+      "step": 230
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.39078296075160246,
+      "learning_rate": 0.00019948099548461096,
+      "loss": 0.6713,
+      "step": 231
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.3881355954199725,
+      "learning_rate": 0.0001994721693033536,
+      "loss": 0.6901,
+      "step": 232
+    },
+    {
+      "epoch": 0.06213333333333333,
+      "grad_norm": 0.39450258580406516,
+      "learning_rate": 0.00019946326890316345,
+      "loss": 0.6874,
+      "step": 233
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.3842897115626507,
+      "learning_rate": 0.00019945429429068127,
+      "loss": 0.6701,
+      "step": 234
+    },
+    {
+      "epoch": 0.06266666666666666,
+      "grad_norm": 0.42412588826941533,
+      "learning_rate": 0.00019944524547260332,
+      "loss": 0.7897,
+      "step": 235
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.39660675719735494,
+      "learning_rate": 0.00019943612245568114,
+      "loss": 0.7323,
+      "step": 236
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.452702035122642,
+      "learning_rate": 0.0001994269252467217,
+      "loss": 0.6731,
+      "step": 237
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.39058240002897643,
+      "learning_rate": 0.00019941765385258725,
+      "loss": 0.7142,
+      "step": 238
+    },
+    {
+      "epoch": 0.06373333333333334,
+      "grad_norm": 0.403458169929397,
+      "learning_rate": 0.00019940830828019546,
+      "loss": 0.7358,
+      "step": 239
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4030035336489462,
+      "learning_rate": 0.0001993988885365193,
+      "loss": 0.6907,
+      "step": 240
+    },
+    {
+      "epoch": 0.06426666666666667,
+      "grad_norm": 0.41307744746815045,
+      "learning_rate": 0.00019938939462858714,
+      "loss": 0.7133,
+      "step": 241
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.4162945649290952,
+      "learning_rate": 0.0001993798265634826,
+      "loss": 0.7393,
+      "step": 242
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.44325936934792853,
+      "learning_rate": 0.0001993701843483447,
+      "loss": 0.7154,
+      "step": 243
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.3811590653484472,
+      "learning_rate": 0.00019936046799036777,
+      "loss": 0.6902,
+      "step": 244
+    },
+    {
+      "epoch": 0.06533333333333333,
+      "grad_norm": 0.4170311529581271,
+      "learning_rate": 0.00019935067749680147,
+      "loss": 0.6933,
+      "step": 245
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.38738516880995455,
+      "learning_rate": 0.00019934081287495067,
+      "loss": 0.7201,
+      "step": 246
+    },
+    {
+      "epoch": 0.06586666666666667,
+      "grad_norm": 0.4371298094780602,
+      "learning_rate": 0.00019933087413217575,
+      "loss": 0.7512,
+      "step": 247
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.40615564597816506,
+      "learning_rate": 0.0001993208612758922,
+      "loss": 0.7315,
+      "step": 248
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.36799596485043146,
+      "learning_rate": 0.00019931077431357096,
+      "loss": 0.6731,
+      "step": 249
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4075115948919754,
+      "learning_rate": 0.0001993006132527381,
+      "loss": 0.7257,
+      "step": 250
+    },
+    {
+      "epoch": 0.06693333333333333,
+      "grad_norm": 0.41397887078278667,
+      "learning_rate": 0.00019929037810097514,
+      "loss": 0.7463,
+      "step": 251
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.39935735721022253,
+      "learning_rate": 0.0001992800688659188,
+      "loss": 0.7212,
+      "step": 252
+    },
+    {
+      "epoch": 0.06746666666666666,
+      "grad_norm": 0.397466312253459,
+      "learning_rate": 0.00019926968555526107,
+      "loss": 0.7401,
+      "step": 253
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.3993745573428489,
+      "learning_rate": 0.00019925922817674922,
+      "loss": 0.7534,
+      "step": 254
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.4135574441728364,
+      "learning_rate": 0.00019924869673818577,
+      "loss": 0.669,
+      "step": 255
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.3932811852789586,
+      "learning_rate": 0.00019923809124742858,
+      "loss": 0.6537,
+      "step": 256
+    },
+    {
+      "epoch": 0.06853333333333333,
+      "grad_norm": 0.4306549060493546,
+      "learning_rate": 0.00019922741171239064,
+      "loss": 0.7526,
+      "step": 257
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4485140229636536,
+      "learning_rate": 0.00019921665814104028,
+      "loss": 0.7535,
+      "step": 258
+    },
+    {
+      "epoch": 0.06906666666666667,
+      "grad_norm": 0.407145027253977,
+      "learning_rate": 0.00019920583054140102,
+      "loss": 0.7281,
+      "step": 259
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.444754696972281,
+      "learning_rate": 0.00019919492892155164,
+      "loss": 0.7493,
+      "step": 260
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.3834940923800163,
+      "learning_rate": 0.00019918395328962613,
+      "loss": 0.7663,
+      "step": 261
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.5135946291558379,
+      "learning_rate": 0.00019917290365381373,
+      "loss": 0.7565,
+      "step": 262
+    },
+    {
+      "epoch": 0.07013333333333334,
+      "grad_norm": 0.41456780545003297,
+      "learning_rate": 0.00019916178002235885,
+      "loss": 0.7214,
+      "step": 263
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3817303462004924,
+      "learning_rate": 0.00019915058240356118,
+      "loss": 0.7189,
+      "step": 264
+    },
+    {
+      "epoch": 0.07066666666666667,
+      "grad_norm": 0.6253105182319767,
+      "learning_rate": 0.00019913931080577552,
+      "loss": 0.7168,
+      "step": 265
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.41538467401381385,
+      "learning_rate": 0.00019912796523741198,
+      "loss": 0.7452,
+      "step": 266
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.44820153303680027,
+      "learning_rate": 0.00019911654570693574,
+      "loss": 0.6903,
+      "step": 267
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.42349113398474625,
+      "learning_rate": 0.0001991050522228673,
+      "loss": 0.7091,
+      "step": 268
+    },
+    {
+      "epoch": 0.07173333333333333,
+      "grad_norm": 0.38713460495344987,
+      "learning_rate": 0.00019909348479378217,
+      "loss": 0.7084,
+      "step": 269
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.42696161551154627,
+      "learning_rate": 0.0001990818434283112,
+      "loss": 0.6797,
+      "step": 270
+    },
+    {
+      "epoch": 0.07226666666666667,
+      "grad_norm": 0.4044084040674422,
+      "learning_rate": 0.00019907012813514033,
+      "loss": 0.6659,
+      "step": 271
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.41744902477227236,
+      "learning_rate": 0.00019905833892301065,
+      "loss": 0.7039,
+      "step": 272
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.40150435958851094,
+      "learning_rate": 0.0001990464758007184,
+      "loss": 0.7214,
+      "step": 273
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.4284395338256494,
+      "learning_rate": 0.000199034538777115,
+      "loss": 0.7717,
+      "step": 274
+    },
+    {
+      "epoch": 0.07333333333333333,
+      "grad_norm": 0.383533857037192,
+      "learning_rate": 0.000199022527861107,
+      "loss": 0.658,
+      "step": 275
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3900923018002736,
+      "learning_rate": 0.00019901044306165606,
+      "loss": 0.7199,
+      "step": 276
+    },
+    {
+      "epoch": 0.07386666666666666,
+      "grad_norm": 0.3947161443368059,
+      "learning_rate": 0.00019899828438777899,
+      "loss": 0.7204,
+      "step": 277
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.3763194767049703,
+      "learning_rate": 0.00019898605184854774,
+      "loss": 0.682,
+      "step": 278
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.39356463846876866,
+      "learning_rate": 0.00019897374545308928,
+      "loss": 0.715,
+      "step": 279
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.40231244413746337,
+      "learning_rate": 0.0001989613652105858,
+      "loss": 0.7184,
+      "step": 280
+    },
+    {
+      "epoch": 0.07493333333333334,
+      "grad_norm": 0.40646593423145344,
+      "learning_rate": 0.00019894891113027456,
+      "loss": 0.6882,
+      "step": 281
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.39043792467687116,
+      "learning_rate": 0.00019893638322144788,
+      "loss": 0.7521,
+      "step": 282
+    },
+    {
+      "epoch": 0.07546666666666667,
+      "grad_norm": 0.3976099281474279,
+      "learning_rate": 0.0001989237814934531,
+      "loss": 0.6594,
+      "step": 283
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.41092639532736164,
+      "learning_rate": 0.00019891110595569283,
+      "loss": 0.7229,
+      "step": 284
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.3810076008125446,
+      "learning_rate": 0.00019889835661762457,
+      "loss": 0.6801,
+      "step": 285
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.38830977529156335,
+      "learning_rate": 0.00019888553348876097,
+      "loss": 0.708,
+      "step": 286
+    },
+    {
+      "epoch": 0.07653333333333333,
+      "grad_norm": 0.416149543478018,
+      "learning_rate": 0.00019887263657866972,
+      "loss": 0.7352,
+      "step": 287
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4053726072491817,
+      "learning_rate": 0.0001988596658969736,
+      "loss": 0.7113,
+      "step": 288
+    },
+    {
+      "epoch": 0.07706666666666667,
+      "grad_norm": 0.4145930541903273,
+      "learning_rate": 0.00019884662145335034,
+      "loss": 0.701,
+      "step": 289
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.40095414721333383,
+      "learning_rate": 0.00019883350325753277,
+      "loss": 0.6995,
+      "step": 290
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.40280416171137895,
+      "learning_rate": 0.00019882031131930874,
+      "loss": 0.7844,
+      "step": 291
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.37286594222117764,
+      "learning_rate": 0.00019880704564852113,
+      "loss": 0.7309,
+      "step": 292
+    },
+    {
+      "epoch": 0.07813333333333333,
+      "grad_norm": 0.39936297044802194,
+      "learning_rate": 0.0001987937062550678,
+      "loss": 0.7053,
+      "step": 293
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.39061374120110426,
+      "learning_rate": 0.0001987802931489017,
+      "loss": 0.7232,
+      "step": 294
+    },
+    {
+      "epoch": 0.07866666666666666,
+      "grad_norm": 0.39757752193953794,
+      "learning_rate": 0.00019876680634003068,
+      "loss": 0.7429,
+      "step": 295
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.4104169489448244,
+      "learning_rate": 0.00019875324583851757,
+      "loss": 0.7115,
+      "step": 296
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.37368511859538683,
+      "learning_rate": 0.00019873961165448032,
+      "loss": 0.7219,
+      "step": 297
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.4199598853000963,
+      "learning_rate": 0.00019872590379809172,
+      "loss": 0.7756,
+      "step": 298
+    },
+    {
+      "epoch": 0.07973333333333334,
+      "grad_norm": 0.40882562410751905,
+      "learning_rate": 0.00019871212227957961,
+      "loss": 0.6989,
+      "step": 299
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4090549309973313,
+      "learning_rate": 0.00019869826710922675,
+      "loss": 0.7451,
+      "step": 300
+    },
+    {
+      "epoch": 0.08026666666666667,
+      "grad_norm": 0.40162365150041546,
+      "learning_rate": 0.00019868433829737083,
+      "loss": 0.7506,
+      "step": 301
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.4094513705677747,
+      "learning_rate": 0.00019867033585440456,
+      "loss": 0.7675,
+      "step": 302
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.39038982751168183,
+      "learning_rate": 0.00019865625979077555,
+      "loss": 0.7277,
+      "step": 303
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.41323184742019836,
+      "learning_rate": 0.00019864211011698634,
+      "loss": 0.7182,
+      "step": 304
+    },
+    {
+      "epoch": 0.08133333333333333,
+      "grad_norm": 0.38188370467533733,
+      "learning_rate": 0.00019862788684359438,
+      "loss": 0.6865,
+      "step": 305
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.38843917921763027,
+      "learning_rate": 0.00019861358998121204,
+      "loss": 0.7148,
+      "step": 306
+    },
+    {
+      "epoch": 0.08186666666666667,
+      "grad_norm": 0.37220997337943657,
+      "learning_rate": 0.00019859921954050664,
+      "loss": 0.7134,
+      "step": 307
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.3986392957863737,
+      "learning_rate": 0.00019858477553220033,
+      "loss": 0.7156,
+      "step": 308
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.3977418558092343,
+      "learning_rate": 0.0001985702579670702,
+      "loss": 0.7261,
+      "step": 309
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.3840774177569215,
+      "learning_rate": 0.00019855566685594815,
+      "loss": 0.7901,
+      "step": 310
+    },
+    {
+      "epoch": 0.08293333333333333,
+      "grad_norm": 0.37988919505690316,
+      "learning_rate": 0.00019854100220972112,
+      "loss": 0.7309,
+      "step": 311
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3707403548715134,
+      "learning_rate": 0.0001985262640393307,
+      "loss": 0.704,
+      "step": 312
+    },
+    {
+      "epoch": 0.08346666666666666,
+      "grad_norm": 0.3808201215582751,
+      "learning_rate": 0.00019851145235577354,
+      "loss": 0.6774,
+      "step": 313
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.4045590430944023,
+      "learning_rate": 0.00019849656717010094,
+      "loss": 0.7009,
+      "step": 314
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.407890590412557,
+      "learning_rate": 0.00019848160849341925,
+      "loss": 0.7121,
+      "step": 315
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.3998158812732857,
+      "learning_rate": 0.0001984665763368895,
+      "loss": 0.7352,
+      "step": 316
+    },
+    {
+      "epoch": 0.08453333333333334,
+      "grad_norm": 0.4398899486261411,
+      "learning_rate": 0.00019845147071172759,
+      "loss": 0.7312,
+      "step": 317
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.39424069500295306,
+      "learning_rate": 0.00019843629162920426,
+      "loss": 0.7057,
+      "step": 318
+    },
+    {
+      "epoch": 0.08506666666666667,
+      "grad_norm": 0.39213572853124057,
+      "learning_rate": 0.00019842103910064506,
+      "loss": 0.7003,
+      "step": 319
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.4088822832005749,
+      "learning_rate": 0.00019840571313743032,
+      "loss": 0.7312,
+      "step": 320
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.38220192094973704,
+      "learning_rate": 0.00019839031375099513,
+      "loss": 0.6551,
+      "step": 321
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4297798800475834,
+      "learning_rate": 0.00019837484095282942,
+      "loss": 0.7285,
+      "step": 322
+    },
+    {
+      "epoch": 0.08613333333333334,
+      "grad_norm": 0.39065133633158505,
+      "learning_rate": 0.00019835929475447785,
+      "loss": 0.7,
+      "step": 323
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.3768072519822633,
+      "learning_rate": 0.0001983436751675399,
+      "loss": 0.686,
+      "step": 324
+    },
+    {
+      "epoch": 0.08666666666666667,
+      "grad_norm": 0.4033863317710327,
+      "learning_rate": 0.00019832798220366978,
+      "loss": 0.7154,
+      "step": 325
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.3970458373238512,
+      "learning_rate": 0.0001983122158745764,
+      "loss": 0.7407,
+      "step": 326
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.3833499500800447,
+      "learning_rate": 0.0001982963761920235,
+      "loss": 0.7371,
+      "step": 327
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.39051951392278295,
+      "learning_rate": 0.00019828046316782948,
+      "loss": 0.7139,
+      "step": 328
+    },
+    {
+      "epoch": 0.08773333333333333,
+      "grad_norm": 0.416335268121753,
+      "learning_rate": 0.0001982644768138675,
+      "loss": 0.7186,
+      "step": 329
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.39878078216361823,
+      "learning_rate": 0.0001982484171420654,
+      "loss": 0.7223,
+      "step": 330
+    },
+    {
+      "epoch": 0.08826666666666666,
+      "grad_norm": 0.4258223335265266,
+      "learning_rate": 0.00019823228416440575,
+      "loss": 0.6913,
+      "step": 331
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.36763344196266556,
+      "learning_rate": 0.00019821607789292583,
+      "loss": 0.7624,
+      "step": 332
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.3784547523972229,
+      "learning_rate": 0.00019819979833971755,
+      "loss": 0.7005,
+      "step": 333
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.41740648720516116,
+      "learning_rate": 0.00019818344551692757,
+      "loss": 0.7039,
+      "step": 334
+    },
+    {
+      "epoch": 0.08933333333333333,
+      "grad_norm": 0.3669460038794941,
+      "learning_rate": 0.00019816701943675718,
+      "loss": 0.7416,
+      "step": 335
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.47643637440544684,
+      "learning_rate": 0.0001981505201114623,
+      "loss": 0.7406,
+      "step": 336
+    },
+    {
+      "epoch": 0.08986666666666666,
+      "grad_norm": 0.4111404653795723,
+      "learning_rate": 0.0001981339475533536,
+      "loss": 0.7365,
+      "step": 337
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.398282186305748,
+      "learning_rate": 0.00019811730177479625,
+      "loss": 0.7166,
+      "step": 338
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.41489095670999077,
+      "learning_rate": 0.00019810058278821015,
+      "loss": 0.6831,
+      "step": 339
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.41367677985636275,
+      "learning_rate": 0.0001980837906060698,
+      "loss": 0.7237,
+      "step": 340
+    },
+    {
+      "epoch": 0.09093333333333334,
+      "grad_norm": 0.4118613955842153,
+      "learning_rate": 0.00019806692524090434,
+      "loss": 0.7493,
+      "step": 341
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.38049350260478093,
+      "learning_rate": 0.0001980499867052974,
+      "loss": 0.7206,
+      "step": 342
+    },
+    {
+      "epoch": 0.09146666666666667,
+      "grad_norm": 0.363861364552363,
+      "learning_rate": 0.0001980329750118874,
+      "loss": 0.6824,
+      "step": 343
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.36385625278410144,
+      "learning_rate": 0.00019801589017336715,
+      "loss": 0.6738,
+      "step": 344
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.4084914661595541,
+      "learning_rate": 0.00019799873220248415,
+      "loss": 0.7318,
+      "step": 345
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.38902592796422086,
+      "learning_rate": 0.00019798150111204047,
+      "loss": 0.7119,
+      "step": 346
+    },
+    {
+      "epoch": 0.09253333333333333,
+      "grad_norm": 0.3947885945538274,
+      "learning_rate": 0.00019796419691489264,
+      "loss": 0.7115,
+      "step": 347
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3890436587380582,
+      "learning_rate": 0.00019794681962395183,
+      "loss": 0.7276,
+      "step": 348
+    },
+    {
+      "epoch": 0.09306666666666667,
+      "grad_norm": 0.3926882677435967,
+      "learning_rate": 0.00019792936925218372,
+      "loss": 0.6663,
+      "step": 349
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.36238182009460046,
+      "learning_rate": 0.00019791184581260848,
+      "loss": 0.6512,
+      "step": 350
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.3662753465749837,
+      "learning_rate": 0.00019789424931830087,
+      "loss": 0.6519,
+      "step": 351
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.38546599016871463,
+      "learning_rate": 0.00019787657978239014,
+      "loss": 0.6919,
+      "step": 352
+    },
+    {
+      "epoch": 0.09413333333333333,
+      "grad_norm": 0.374980384366503,
+      "learning_rate": 0.00019785883721806,
+      "loss": 0.7021,
+      "step": 353
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.3598183990882242,
+      "learning_rate": 0.00019784102163854862,
+      "loss": 0.7078,
+      "step": 354
+    },
+    {
+      "epoch": 0.09466666666666666,
+      "grad_norm": 0.3940805411387451,
+      "learning_rate": 0.00019782313305714873,
+      "loss": 0.7336,
+      "step": 355
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.36953792011424874,
+      "learning_rate": 0.00019780517148720753,
+      "loss": 0.6453,
+      "step": 356
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.3890574355800986,
+      "learning_rate": 0.0001977871369421266,
+      "loss": 0.6786,
+      "step": 357
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.4112663862907459,
+      "learning_rate": 0.000197769029435362,
+      "loss": 0.7032,
+      "step": 358
+    },
+    {
+      "epoch": 0.09573333333333334,
+      "grad_norm": 0.37062221805783596,
+      "learning_rate": 0.00019775084898042427,
+      "loss": 0.6423,
+      "step": 359
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3825463311199024,
+      "learning_rate": 0.00019773259559087837,
+      "loss": 0.6678,
+      "step": 360
+    },
+    {
+      "epoch": 0.09626666666666667,
+      "grad_norm": 0.3779182311924708,
+      "learning_rate": 0.0001977142692803436,
+      "loss": 0.6635,
+      "step": 361
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.4010129545052071,
+      "learning_rate": 0.00019769587006249382,
+      "loss": 0.7255,
+      "step": 362
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.3886084008931301,
+      "learning_rate": 0.0001976773979510571,
+      "loss": 0.7252,
+      "step": 363
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.3919435017107045,
+      "learning_rate": 0.000197658852959816,
+      "loss": 0.7018,
+      "step": 364
+    },
+    {
+      "epoch": 0.09733333333333333,
+      "grad_norm": 0.38254913244976363,
+      "learning_rate": 0.0001976402351026075,
+      "loss": 0.7206,
+      "step": 365
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4438518685757539,
+      "learning_rate": 0.00019762154439332289,
+      "loss": 0.6991,
+      "step": 366
+    },
+    {
+      "epoch": 0.09786666666666667,
+      "grad_norm": 0.41508929267611483,
+      "learning_rate": 0.00019760278084590777,
+      "loss": 0.7166,
+      "step": 367
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.43536758123081454,
+      "learning_rate": 0.0001975839444743622,
+      "loss": 0.7193,
+      "step": 368
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.3856740683698884,
+      "learning_rate": 0.00019756503529274046,
+      "loss": 0.7189,
+      "step": 369
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.39611104594234997,
+      "learning_rate": 0.00019754605331515128,
+      "loss": 0.7121,
+      "step": 370
+    },
+    {
+      "epoch": 0.09893333333333333,
+      "grad_norm": 0.37928103167529814,
+      "learning_rate": 0.0001975269985557576,
+      "loss": 0.6837,
+      "step": 371
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3888164429713909,
+      "learning_rate": 0.00019750787102877673,
+      "loss": 0.6697,
+      "step": 372
+    },
+    {
+      "epoch": 0.09946666666666666,
+      "grad_norm": 0.40899344492977824,
+      "learning_rate": 0.0001974886707484802,
+      "loss": 0.7001,
+      "step": 373
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.41039107754431725,
+      "learning_rate": 0.00019746939772919393,
+      "loss": 0.7342,
+      "step": 374
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.3765992456098516,
+      "learning_rate": 0.00019745005198529799,
+      "loss": 0.7021,
+      "step": 375
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.38566329561207197,
+      "learning_rate": 0.00019743063353122676,
+      "loss": 0.7452,
+      "step": 376
+    },
+    {
+      "epoch": 0.10053333333333334,
+      "grad_norm": 0.3898370403448482,
+      "learning_rate": 0.00019741114238146899,
+      "loss": 0.7206,
+      "step": 377
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.39569847947339876,
+      "learning_rate": 0.00019739157855056747,
+      "loss": 0.7104,
+      "step": 378
+    },
+    {
+      "epoch": 0.10106666666666667,
+      "grad_norm": 0.4720163610507204,
+      "learning_rate": 0.00019737194205311936,
+      "loss": 0.7244,
+      "step": 379
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.3860555716947627,
+      "learning_rate": 0.00019735223290377594,
+      "loss": 0.67,
+      "step": 380
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.39251393253669836,
+      "learning_rate": 0.0001973324511172428,
+      "loss": 0.7087,
+      "step": 381
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.42593556820946193,
+      "learning_rate": 0.0001973125967082797,
+      "loss": 0.6885,
+      "step": 382
+    },
+    {
+      "epoch": 0.10213333333333334,
+      "grad_norm": 0.40780829567432786,
+      "learning_rate": 0.00019729266969170049,
+      "loss": 0.711,
+      "step": 383
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4056697673225834,
+      "learning_rate": 0.00019727267008237334,
+      "loss": 0.7211,
+      "step": 384
+    },
+    {
+      "epoch": 0.10266666666666667,
+      "grad_norm": 0.39581626367201905,
+      "learning_rate": 0.00019725259789522045,
+      "loss": 0.7272,
+      "step": 385
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.373874531062953,
+      "learning_rate": 0.00019723245314521827,
+      "loss": 0.7052,
+      "step": 386
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.4068541260659157,
+      "learning_rate": 0.00019721223584739735,
+      "loss": 0.7115,
+      "step": 387
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.38118981432547044,
+      "learning_rate": 0.00019719194601684235,
+      "loss": 0.6784,
+      "step": 388
+    },
+    {
+      "epoch": 0.10373333333333333,
+      "grad_norm": 0.3881837421417207,
+      "learning_rate": 0.0001971715836686921,
+      "loss": 0.7183,
+      "step": 389
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.40651980511531366,
+      "learning_rate": 0.0001971511488181395,
+      "loss": 0.6528,
+      "step": 390
+    },
+    {
+      "epoch": 0.10426666666666666,
+      "grad_norm": 0.36871056090016163,
+      "learning_rate": 0.00019713064148043158,
+      "loss": 0.7222,
+      "step": 391
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.3948854284153273,
+      "learning_rate": 0.00019711006167086938,
+      "loss": 0.7109,
+      "step": 392
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.4027591206578172,
+      "learning_rate": 0.0001970894094048081,
+      "loss": 0.716,
+      "step": 393
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.36856288639039225,
+      "learning_rate": 0.00019706868469765695,
+      "loss": 0.6629,
+      "step": 394
+    },
+    {
+      "epoch": 0.10533333333333333,
+      "grad_norm": 0.40244812724478196,
+      "learning_rate": 0.00019704788756487926,
+      "loss": 0.6666,
+      "step": 395
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.40518224265258496,
+      "learning_rate": 0.00019702701802199227,
+      "loss": 0.7695,
+      "step": 396
+    },
+    {
+      "epoch": 0.10586666666666666,
+      "grad_norm": 0.39128058766537643,
+      "learning_rate": 0.00019700607608456733,
+      "loss": 0.7157,
+      "step": 397
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.3548081988054615,
+      "learning_rate": 0.00019698506176822988,
+      "loss": 0.7052,
+      "step": 398
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.37775388733574533,
+      "learning_rate": 0.00019696397508865918,
+      "loss": 0.6781,
+      "step": 399
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.3777432616655833,
+      "learning_rate": 0.00019694281606158864,
+      "loss": 0.694,
+      "step": 400
+    },
+    {
+      "epoch": 0.10693333333333334,
+      "grad_norm": 0.3817320084513074,
+      "learning_rate": 0.0001969215847028056,
+      "loss": 0.7158,
+      "step": 401
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.36780205270365646,
+      "learning_rate": 0.0001969002810281513,
+      "loss": 0.6478,
+      "step": 402
+    },
+    {
+      "epoch": 0.10746666666666667,
+      "grad_norm": 0.3575251301932596,
+      "learning_rate": 0.00019687890505352108,
+      "loss": 0.6592,
+      "step": 403
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.36015115408778114,
+      "learning_rate": 0.0001968574567948641,
+      "loss": 0.6652,
+      "step": 404
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.37514101701633856,
+      "learning_rate": 0.0001968359362681835,
+      "loss": 0.7163,
+      "step": 405
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.35848046775503867,
+      "learning_rate": 0.00019681434348953636,
+      "loss": 0.6813,
+      "step": 406
+    },
+    {
+      "epoch": 0.10853333333333333,
+      "grad_norm": 0.38195265703262476,
+      "learning_rate": 0.0001967926784750336,
+      "loss": 0.6695,
+      "step": 407
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.41766714739982413,
+      "learning_rate": 0.00019677094124084018,
+      "loss": 0.7335,
+      "step": 408
+    },
+    {
+      "epoch": 0.10906666666666667,
+      "grad_norm": 0.39122290353640826,
+      "learning_rate": 0.00019674913180317476,
+      "loss": 0.6989,
+      "step": 409
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.3974503199045698,
+      "learning_rate": 0.00019672725017831,
+      "loss": 0.6931,
+      "step": 410
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.3735666471227619,
+      "learning_rate": 0.0001967052963825724,
+      "loss": 0.6953,
+      "step": 411
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.4089716436278948,
+      "learning_rate": 0.00019668327043234225,
+      "loss": 0.7341,
+      "step": 412
+    },
+    {
+      "epoch": 0.11013333333333333,
+      "grad_norm": 0.38233747405220225,
+      "learning_rate": 0.00019666117234405376,
+      "loss": 0.6879,
+      "step": 413
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.3801004094705498,
+      "learning_rate": 0.0001966390021341949,
+      "loss": 0.673,
+      "step": 414
+    },
+    {
+      "epoch": 0.11066666666666666,
+      "grad_norm": 0.3852979399073058,
+      "learning_rate": 0.00019661675981930748,
+      "loss": 0.7,
+      "step": 415
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.36823664882709845,
+      "learning_rate": 0.0001965944454159871,
+      "loss": 0.638,
+      "step": 416
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.39215177508706645,
+      "learning_rate": 0.0001965720589408832,
+      "loss": 0.7268,
+      "step": 417
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.39177098212365996,
+      "learning_rate": 0.0001965496004106989,
+      "loss": 0.7345,
+      "step": 418
+    },
+    {
+      "epoch": 0.11173333333333334,
+      "grad_norm": 0.41500414564582316,
+      "learning_rate": 0.0001965270698421911,
+      "loss": 0.6647,
+      "step": 419
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4293857469972005,
+      "learning_rate": 0.00019650446725217056,
+      "loss": 0.7198,
+      "step": 420
+    },
+    {
+      "epoch": 0.11226666666666667,
+      "grad_norm": 0.5469028745667647,
+      "learning_rate": 0.00019648179265750165,
+      "loss": 0.7457,
+      "step": 421
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.389404673712786,
+      "learning_rate": 0.00019645904607510248,
+      "loss": 0.7336,
+      "step": 422
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.4632565101273264,
+      "learning_rate": 0.00019643622752194497,
+      "loss": 0.6786,
+      "step": 423
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.39670356476204083,
+      "learning_rate": 0.00019641333701505463,
+      "loss": 0.7391,
+      "step": 424
+    },
+    {
+      "epoch": 0.11333333333333333,
+      "grad_norm": 0.3879711271305278,
+      "learning_rate": 0.00019639037457151073,
+      "loss": 0.7339,
+      "step": 425
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.38946149940665215,
+      "learning_rate": 0.00019636734020844613,
+      "loss": 0.7366,
+      "step": 426
+    },
+    {
+      "epoch": 0.11386666666666667,
+      "grad_norm": 0.3769750012390701,
+      "learning_rate": 0.00019634423394304749,
+      "loss": 0.6679,
+      "step": 427
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.3941058498900877,
+      "learning_rate": 0.00019632105579255496,
+      "loss": 0.7171,
+      "step": 428
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.3942461239339633,
+      "learning_rate": 0.00019629780577426243,
+      "loss": 0.7284,
+      "step": 429
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.39236431276715966,
+      "learning_rate": 0.00019627448390551735,
+      "loss": 0.7804,
+      "step": 430
+    },
+    {
+      "epoch": 0.11493333333333333,
+      "grad_norm": 0.4247373584191937,
+      "learning_rate": 0.00019625109020372084,
+      "loss": 0.7512,
+      "step": 431
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.41224618842272703,
+      "learning_rate": 0.00019622762468632759,
+      "loss": 0.7288,
+      "step": 432
+    },
+    {
+      "epoch": 0.11546666666666666,
+      "grad_norm": 0.38269657954130515,
+      "learning_rate": 0.00019620408737084586,
+      "loss": 0.7338,
+      "step": 433
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.38041705459703384,
+      "learning_rate": 0.00019618047827483744,
+      "loss": 0.6806,
+      "step": 434
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.3527624598512677,
+      "learning_rate": 0.00019615679741591784,
+      "loss": 0.6883,
+      "step": 435
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.40031145378579214,
+      "learning_rate": 0.00019613304481175595,
+      "loss": 0.6551,
+      "step": 436
+    },
+    {
+      "epoch": 0.11653333333333334,
+      "grad_norm": 0.38611200230215986,
+      "learning_rate": 0.0001961092204800742,
+      "loss": 0.7388,
+      "step": 437
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.39393977207600733,
+      "learning_rate": 0.0001960853244386486,
+      "loss": 0.7083,
+      "step": 438
+    },
+    {
+      "epoch": 0.11706666666666667,
+      "grad_norm": 0.4090195353783212,
+      "learning_rate": 0.00019606135670530872,
+      "loss": 0.7175,
+      "step": 439
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.3936963270607546,
+      "learning_rate": 0.00019603731729793747,
+      "loss": 0.7228,
+      "step": 440
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.38046986789467907,
+      "learning_rate": 0.00019601320623447132,
+      "loss": 0.675,
+      "step": 441
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.3968625139483729,
+      "learning_rate": 0.00019598902353290022,
+      "loss": 0.6765,
+      "step": 442
+    },
+    {
+      "epoch": 0.11813333333333334,
+      "grad_norm": 0.42326442027237987,
+      "learning_rate": 0.00019596476921126757,
+      "loss": 0.7131,
+      "step": 443
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.388244436576597,
+      "learning_rate": 0.00019594044328767016,
+      "loss": 0.701,
+      "step": 444
+    },
+    {
+      "epoch": 0.11866666666666667,
+      "grad_norm": 0.4706560893775817,
+      "learning_rate": 0.00019591604578025825,
+      "loss": 0.7012,
+      "step": 445
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.3712409004617936,
+      "learning_rate": 0.00019589157670723547,
+      "loss": 0.7553,
+      "step": 446
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.3878336725075563,
+      "learning_rate": 0.0001958670360868589,
+      "loss": 0.7075,
+      "step": 447
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.37540072132025487,
+      "learning_rate": 0.00019584242393743897,
+      "loss": 0.7147,
+      "step": 448
+    },
+    {
+      "epoch": 0.11973333333333333,
+      "grad_norm": 0.38676224850073415,
+      "learning_rate": 0.00019581774027733947,
+      "loss": 0.6761,
+      "step": 449
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.3669138850766407,
+      "learning_rate": 0.00019579298512497758,
+      "loss": 0.765,
+      "step": 450
+    },
+    {
+      "epoch": 0.12026666666666666,
+      "grad_norm": 0.3613272862413039,
+      "learning_rate": 0.00019576815849882377,
+      "loss": 0.646,
+      "step": 451
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.38302150751257635,
+      "learning_rate": 0.0001957432604174019,
+      "loss": 0.6995,
+      "step": 452
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.43263743750584843,
+      "learning_rate": 0.0001957182908992891,
+      "loss": 0.7126,
+      "step": 453
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.4288227827429822,
+      "learning_rate": 0.00019569324996311584,
+      "loss": 0.7298,
+      "step": 454
+    },
+    {
+      "epoch": 0.12133333333333333,
+      "grad_norm": 0.3738395629251792,
+      "learning_rate": 0.00019566813762756584,
+      "loss": 0.7032,
+      "step": 455
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3985200360659549,
+      "learning_rate": 0.0001956429539113761,
+      "loss": 0.6797,
+      "step": 456
+    },
+    {
+      "epoch": 0.12186666666666666,
+      "grad_norm": 0.3778754358482348,
+      "learning_rate": 0.00019561769883333688,
+      "loss": 0.6595,
+      "step": 457
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.37786666063658525,
+      "learning_rate": 0.00019559237241229173,
+      "loss": 0.6953,
+      "step": 458
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.3789340648124684,
+      "learning_rate": 0.00019556697466713735,
+      "loss": 0.761,
+      "step": 459
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.3730473524126068,
+      "learning_rate": 0.00019554150561682372,
+      "loss": 0.7201,
+      "step": 460
+    },
+    {
+      "epoch": 0.12293333333333334,
+      "grad_norm": 0.3769221787949578,
+      "learning_rate": 0.000195515965280354,
+      "loss": 0.7414,
+      "step": 461
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.38519664938922243,
+      "learning_rate": 0.00019549035367678451,
+      "loss": 0.718,
+      "step": 462
+    },
+    {
+      "epoch": 0.12346666666666667,
+      "grad_norm": 0.3699775356024406,
+      "learning_rate": 0.00019546467082522483,
+      "loss": 0.721,
+      "step": 463
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.372762708466123,
+      "learning_rate": 0.00019543891674483766,
+      "loss": 0.6597,
+      "step": 464
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.36824938738061047,
+      "learning_rate": 0.0001954130914548387,
+      "loss": 0.709,
+      "step": 465
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.4107032479970567,
+      "learning_rate": 0.00019538719497449707,
+      "loss": 0.7142,
+      "step": 466
+    },
+    {
+      "epoch": 0.12453333333333333,
+      "grad_norm": 0.4254004625795781,
+      "learning_rate": 0.00019536122732313475,
+      "loss": 0.7167,
+      "step": 467
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.40441743219413473,
+      "learning_rate": 0.00019533518852012693,
+      "loss": 0.7152,
+      "step": 468
+    },
+    {
+      "epoch": 0.12506666666666666,
+      "grad_norm": 0.39060382177465347,
+      "learning_rate": 0.00019530907858490191,
+      "loss": 0.7042,
+      "step": 469
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.3817381243273451,
+      "learning_rate": 0.00019528289753694108,
+      "loss": 0.7459,
+      "step": 470
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.3967028651412326,
+      "learning_rate": 0.00019525664539577875,
+      "loss": 0.6417,
+      "step": 471
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.3939787020778525,
+      "learning_rate": 0.0001952303221810024,
+      "loss": 0.6592,
+      "step": 472
+    },
+    {
+      "epoch": 0.12613333333333332,
+      "grad_norm": 0.4100587751073264,
+      "learning_rate": 0.00019520392791225254,
+      "loss": 0.6638,
+      "step": 473
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.3986654381090219,
+      "learning_rate": 0.0001951774626092226,
+      "loss": 0.7116,
+      "step": 474
+    },
+    {
+      "epoch": 0.12666666666666668,
+      "grad_norm": 0.42074501307535356,
+      "learning_rate": 0.0001951509262916591,
+      "loss": 0.7053,
+      "step": 475
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.38149676264179544,
+      "learning_rate": 0.00019512431897936156,
+      "loss": 0.7113,
+      "step": 476
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.4189460603285744,
+      "learning_rate": 0.0001950976406921824,
+      "loss": 0.688,
+      "step": 477
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.399465157569357,
+      "learning_rate": 0.000195070891450027,
+      "loss": 0.7806,
+      "step": 478
+    },
+    {
+      "epoch": 0.12773333333333334,
+      "grad_norm": 0.3874773618021789,
+      "learning_rate": 0.00019504407127285376,
+      "loss": 0.7193,
+      "step": 479
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.39690339102423877,
+      "learning_rate": 0.00019501718018067395,
+      "loss": 0.6713,
+      "step": 480
+    },
+    {
+      "epoch": 0.12826666666666667,
+      "grad_norm": 0.3912036625403481,
+      "learning_rate": 0.0001949902181935517,
+      "loss": 0.7316,
+      "step": 481
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.39128560548342606,
+      "learning_rate": 0.0001949631853316041,
+      "loss": 0.6494,
+      "step": 482
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.37267424886413625,
+      "learning_rate": 0.0001949360816150012,
+      "loss": 0.6618,
+      "step": 483
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.446165209883545,
+      "learning_rate": 0.00019490890706396575,
+      "loss": 0.7394,
+      "step": 484
+    },
+    {
+      "epoch": 0.12933333333333333,
+      "grad_norm": 0.3772600031543454,
+      "learning_rate": 0.0001948816616987735,
+      "loss": 0.7313,
+      "step": 485
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.36533180458150905,
+      "learning_rate": 0.0001948543455397529,
+      "loss": 0.6811,
+      "step": 486
+    },
+    {
+      "epoch": 0.12986666666666666,
+      "grad_norm": 0.38475823901985934,
+      "learning_rate": 0.00019482695860728531,
+      "loss": 0.6924,
+      "step": 487
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.3959141517349,
+      "learning_rate": 0.0001947995009218049,
+      "loss": 0.6533,
+      "step": 488
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.3615375786062136,
+      "learning_rate": 0.0001947719725037986,
+      "loss": 0.6732,
+      "step": 489
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.38276022814175553,
+      "learning_rate": 0.00019474437337380608,
+      "loss": 0.7338,
+      "step": 490
+    },
+    {
+      "epoch": 0.13093333333333335,
+      "grad_norm": 0.3924131689767105,
+      "learning_rate": 0.00019471670355241988,
+      "loss": 0.7005,
+      "step": 491
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.34943802477738917,
+      "learning_rate": 0.00019468896306028518,
+      "loss": 0.6368,
+      "step": 492
+    },
+    {
+      "epoch": 0.13146666666666668,
+      "grad_norm": 0.3811001180617372,
+      "learning_rate": 0.00019466115191809995,
+      "loss": 0.6497,
+      "step": 493
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.39204351851599806,
+      "learning_rate": 0.00019463327014661484,
+      "loss": 0.6287,
+      "step": 494
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.39848918656273674,
+      "learning_rate": 0.00019460531776663317,
+      "loss": 0.7274,
+      "step": 495
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.3968669072852806,
+      "learning_rate": 0.00019457729479901103,
+      "loss": 0.7507,
+      "step": 496
+    },
+    {
+      "epoch": 0.13253333333333334,
+      "grad_norm": 0.39250779593378327,
+      "learning_rate": 0.00019454920126465715,
+      "loss": 0.7157,
+      "step": 497
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.3596616176352458,
+      "learning_rate": 0.00019452103718453284,
+      "loss": 0.6984,
+      "step": 498
+    },
+    {
+      "epoch": 0.13306666666666667,
+      "grad_norm": 0.37092589890152405,
+      "learning_rate": 0.0001944928025796521,
+      "loss": 0.6931,
+      "step": 499
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.3997701138574225,
+      "learning_rate": 0.0001944644974710816,
+      "loss": 0.6404,
+      "step": 500
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.39686760159172485,
+      "learning_rate": 0.00019443612187994053,
+      "loss": 0.6973,
+      "step": 501
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.377371434441508,
+      "learning_rate": 0.00019440767582740067,
+      "loss": 0.6453,
+      "step": 502
+    },
+    {
+      "epoch": 0.13413333333333333,
+      "grad_norm": 0.3725684843333112,
+      "learning_rate": 0.00019437915933468648,
+      "loss": 0.6715,
+      "step": 503
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.36606500678021225,
+      "learning_rate": 0.0001943505724230748,
+      "loss": 0.6707,
+      "step": 504
+    },
+    {
+      "epoch": 0.13466666666666666,
+      "grad_norm": 0.3661108023226464,
+      "learning_rate": 0.0001943219151138952,
+      "loss": 0.6641,
+      "step": 505
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.3732834449322749,
+      "learning_rate": 0.00019429318742852968,
+      "loss": 0.7234,
+      "step": 506
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.3893008992826764,
+      "learning_rate": 0.00019426438938841277,
+      "loss": 0.7005,
+      "step": 507
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.3687076104728676,
+      "learning_rate": 0.00019423552101503142,
+      "loss": 0.7207,
+      "step": 508
+    },
+    {
+      "epoch": 0.13573333333333334,
+      "grad_norm": 0.38796754942956857,
+      "learning_rate": 0.00019420658232992518,
+      "loss": 0.7281,
+      "step": 509
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.3879548636092806,
+      "learning_rate": 0.00019417757335468596,
+      "loss": 0.6998,
+      "step": 510
+    },
+    {
+      "epoch": 0.13626666666666667,
+      "grad_norm": 0.38507133760254353,
+      "learning_rate": 0.0001941484941109582,
+      "loss": 0.6637,
+      "step": 511
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.38482947061663325,
+      "learning_rate": 0.00019411934462043872,
+      "loss": 0.7342,
+      "step": 512
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.3909204374564946,
+      "learning_rate": 0.00019409012490487668,
+      "loss": 0.7501,
+      "step": 513
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.37049107992470565,
+      "learning_rate": 0.00019406083498607385,
+      "loss": 0.6687,
+      "step": 514
+    },
+    {
+      "epoch": 0.13733333333333334,
+      "grad_norm": 0.3730924397110541,
+      "learning_rate": 0.00019403147488588414,
+      "loss": 0.7122,
+      "step": 515
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.3541371004502841,
+      "learning_rate": 0.000194002044626214,
+      "loss": 0.7239,
+      "step": 516
+    },
+    {
+      "epoch": 0.13786666666666667,
+      "grad_norm": 0.40978986375192183,
+      "learning_rate": 0.00019397254422902206,
+      "loss": 0.7464,
+      "step": 517
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.41531228006812554,
+      "learning_rate": 0.00019394297371631952,
+      "loss": 0.7357,
+      "step": 518
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.3776876049266166,
+      "learning_rate": 0.00019391333311016967,
+      "loss": 0.6312,
+      "step": 519
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.44763687900673493,
+      "learning_rate": 0.00019388362243268824,
+      "loss": 0.7614,
+      "step": 520
+    },
+    {
+      "epoch": 0.13893333333333333,
+      "grad_norm": 0.3722603187197382,
+      "learning_rate": 0.0001938538417060431,
+      "loss": 0.7074,
+      "step": 521
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.3672755771023005,
+      "learning_rate": 0.00019382399095245454,
+      "loss": 0.7071,
+      "step": 522
+    },
+    {
+      "epoch": 0.13946666666666666,
+      "grad_norm": 0.3722366282524974,
+      "learning_rate": 0.000193794070194195,
+      "loss": 0.6997,
+      "step": 523
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.3738321181849173,
+      "learning_rate": 0.0001937640794535892,
+      "loss": 0.6596,
+      "step": 524
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.39904992894995667,
+      "learning_rate": 0.00019373401875301407,
+      "loss": 0.6974,
+      "step": 525
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.37494149466913873,
+      "learning_rate": 0.00019370388811489872,
+      "loss": 0.7086,
+      "step": 526
+    },
+    {
+      "epoch": 0.14053333333333334,
+      "grad_norm": 0.37347478013109475,
+      "learning_rate": 0.00019367368756172443,
+      "loss": 0.7236,
+      "step": 527
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.34784329622429616,
+      "learning_rate": 0.0001936434171160247,
+      "loss": 0.6935,
+      "step": 528
+    },
+    {
+      "epoch": 0.14106666666666667,
+      "grad_norm": 0.3750156617703292,
+      "learning_rate": 0.00019361307680038517,
+      "loss": 0.739,
+      "step": 529
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.37232606672390345,
+      "learning_rate": 0.0001935826666374435,
+      "loss": 0.6925,
+      "step": 530
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.39762486938390995,
+      "learning_rate": 0.0001935521866498896,
+      "loss": 0.7205,
+      "step": 531
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.38044493859724926,
+      "learning_rate": 0.00019352163686046545,
+      "loss": 0.6796,
+      "step": 532
+    },
+    {
+      "epoch": 0.14213333333333333,
+      "grad_norm": 0.39295855919122624,
+      "learning_rate": 0.00019349101729196507,
+      "loss": 0.6914,
+      "step": 533
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.45628995595205124,
+      "learning_rate": 0.00019346032796723454,
+      "loss": 0.6757,
+      "step": 534
+    },
+    {
+      "epoch": 0.14266666666666666,
+      "grad_norm": 0.37460111359597603,
+      "learning_rate": 0.00019342956890917209,
+      "loss": 0.7017,
+      "step": 535
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.38274088967527936,
+      "learning_rate": 0.00019339874014072782,
+      "loss": 0.7232,
+      "step": 536
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.3722286636113766,
+      "learning_rate": 0.00019336784168490396,
+      "loss": 0.7645,
+      "step": 537
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.39360083837185894,
+      "learning_rate": 0.00019333687356475472,
+      "loss": 0.6773,
+      "step": 538
+    },
+    {
+      "epoch": 0.14373333333333332,
+      "grad_norm": 0.4105223572181332,
+      "learning_rate": 0.00019330583580338622,
+      "loss": 0.7111,
+      "step": 539
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3927114267160294,
+      "learning_rate": 0.00019327472842395666,
+      "loss": 0.6867,
+      "step": 540
+    },
+    {
+      "epoch": 0.14426666666666665,
+      "grad_norm": 0.3767313755097996,
+      "learning_rate": 0.00019324355144967605,
+      "loss": 0.7062,
+      "step": 541
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.3877264118901853,
+      "learning_rate": 0.00019321230490380642,
+      "loss": 0.6426,
+      "step": 542
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.3760745762306027,
+      "learning_rate": 0.00019318098880966172,
+      "loss": 0.7041,
+      "step": 543
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.3878799886567483,
+      "learning_rate": 0.00019314960319060767,
+      "loss": 0.6825,
+      "step": 544
+    },
+    {
+      "epoch": 0.14533333333333334,
+      "grad_norm": 0.3798409262367505,
+      "learning_rate": 0.00019311814807006198,
+      "loss": 0.7019,
+      "step": 545
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4760867935296923,
+      "learning_rate": 0.00019308662347149421,
+      "loss": 0.7003,
+      "step": 546
+    },
+    {
+      "epoch": 0.14586666666666667,
+      "grad_norm": 0.36622394318646834,
+      "learning_rate": 0.00019305502941842573,
+      "loss": 0.7382,
+      "step": 547
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.35870682999037357,
+      "learning_rate": 0.00019302336593442972,
+      "loss": 0.6733,
+      "step": 548
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.3977432873452541,
+      "learning_rate": 0.00019299163304313118,
+      "loss": 0.7252,
+      "step": 549
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.3818882114328452,
+      "learning_rate": 0.00019295983076820687,
+      "loss": 0.6856,
+      "step": 550
+    },
+    {
+      "epoch": 0.14693333333333333,
+      "grad_norm": 0.39394746748151416,
+      "learning_rate": 0.00019292795913338542,
+      "loss": 0.7047,
+      "step": 551
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.36698869326363837,
+      "learning_rate": 0.00019289601816244707,
+      "loss": 0.6528,
+      "step": 552
+    },
+    {
+      "epoch": 0.14746666666666666,
+      "grad_norm": 0.3730057549400165,
+      "learning_rate": 0.0001928640078792239,
+      "loss": 0.6872,
+      "step": 553
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.38174600345017096,
+      "learning_rate": 0.0001928319283075996,
+      "loss": 0.6967,
+      "step": 554
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.3643558499171911,
+      "learning_rate": 0.0001927997794715097,
+      "loss": 0.6972,
+      "step": 555
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.3871930364023789,
+      "learning_rate": 0.00019276756139494132,
+      "loss": 0.6772,
+      "step": 556
+    },
+    {
+      "epoch": 0.14853333333333332,
+      "grad_norm": 0.4107975484164306,
+      "learning_rate": 0.00019273527410193324,
+      "loss": 0.7113,
+      "step": 557
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.3725140955042874,
+      "learning_rate": 0.0001927029176165759,
+      "loss": 0.6494,
+      "step": 558
+    },
+    {
+      "epoch": 0.14906666666666665,
+      "grad_norm": 0.3860544287407251,
+      "learning_rate": 0.00019267049196301135,
+      "loss": 0.6827,
+      "step": 559
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.34947671081582554,
+      "learning_rate": 0.00019263799716543335,
+      "loss": 0.6748,
+      "step": 560
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.35548483210574205,
+      "learning_rate": 0.00019260543324808705,
+      "loss": 0.7127,
+      "step": 561
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.3508317019595235,
+      "learning_rate": 0.00019257280023526936,
+      "loss": 0.6872,
+      "step": 562
+    },
+    {
+      "epoch": 0.15013333333333334,
+      "grad_norm": 0.3641977172837662,
+      "learning_rate": 0.00019254009815132864,
+      "loss": 0.69,
+      "step": 563
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3862822763071805,
+      "learning_rate": 0.00019250732702066488,
+      "loss": 0.6951,
+      "step": 564
+    },
+    {
+      "epoch": 0.15066666666666667,
+      "grad_norm": 0.40894636273487134,
+      "learning_rate": 0.00019247448686772944,
+      "loss": 0.6514,
+      "step": 565
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.3693964312769452,
+      "learning_rate": 0.00019244157771702532,
+      "loss": 0.6528,
+      "step": 566
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.3728179985163386,
+      "learning_rate": 0.0001924085995931069,
+      "loss": 0.6728,
+      "step": 567
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.37524843842900835,
+      "learning_rate": 0.00019237555252058015,
+      "loss": 0.6731,
+      "step": 568
+    },
+    {
+      "epoch": 0.15173333333333333,
+      "grad_norm": 0.3650361301148371,
+      "learning_rate": 0.00019234243652410232,
+      "loss": 0.6677,
+      "step": 569
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.38124744091028967,
+      "learning_rate": 0.0001923092516283822,
+      "loss": 0.6578,
+      "step": 570
+    },
+    {
+      "epoch": 0.15226666666666666,
+      "grad_norm": 0.37174150058188726,
+      "learning_rate": 0.00019227599785817998,
+      "loss": 0.6614,
+      "step": 571
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.3658327860581027,
+      "learning_rate": 0.00019224267523830716,
+      "loss": 0.7255,
+      "step": 572
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.36609790952934385,
+      "learning_rate": 0.00019220928379362672,
+      "loss": 0.7278,
+      "step": 573
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.37194123384017186,
+      "learning_rate": 0.00019217582354905295,
+      "loss": 0.6975,
+      "step": 574
+    },
+    {
+      "epoch": 0.15333333333333332,
+      "grad_norm": 0.37098255942659086,
+      "learning_rate": 0.0001921422945295514,
+      "loss": 0.6824,
+      "step": 575
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.39104178693675107,
+      "learning_rate": 0.00019210869676013906,
+      "loss": 0.7462,
+      "step": 576
+    },
+    {
+      "epoch": 0.15386666666666668,
+      "grad_norm": 0.3647716235710022,
+      "learning_rate": 0.00019207503026588406,
+      "loss": 0.7252,
+      "step": 577
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.3834933785003577,
+      "learning_rate": 0.00019204129507190604,
+      "loss": 0.7185,
+      "step": 578
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.37896481990941155,
+      "learning_rate": 0.00019200749120337567,
+      "loss": 0.7405,
+      "step": 579
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.3736305411570718,
+      "learning_rate": 0.00019197361868551494,
+      "loss": 0.723,
+      "step": 580
+    },
+    {
+      "epoch": 0.15493333333333334,
+      "grad_norm": 0.3806871543208302,
+      "learning_rate": 0.00019193967754359715,
+      "loss": 0.6891,
+      "step": 581
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.3932025476034644,
+      "learning_rate": 0.00019190566780294662,
+      "loss": 0.67,
+      "step": 582
+    },
+    {
+      "epoch": 0.15546666666666667,
+      "grad_norm": 0.37005181168015094,
+      "learning_rate": 0.000191871589488939,
+      "loss": 0.6708,
+      "step": 583
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.37698964774570576,
+      "learning_rate": 0.00019183744262700112,
+      "loss": 0.6982,
+      "step": 584
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.39283562495456964,
+      "learning_rate": 0.00019180322724261082,
+      "loss": 0.6928,
+      "step": 585
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.3847267098844669,
+      "learning_rate": 0.00019176894336129716,
+      "loss": 0.6919,
+      "step": 586
+    },
+    {
+      "epoch": 0.15653333333333333,
+      "grad_norm": 0.36784451810982993,
+      "learning_rate": 0.00019173459100864032,
+      "loss": 0.6739,
+      "step": 587
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4208591638419817,
+      "learning_rate": 0.00019170017021027152,
+      "loss": 0.744,
+      "step": 588
+    },
+    {
+      "epoch": 0.15706666666666666,
+      "grad_norm": 0.35266156318845115,
+      "learning_rate": 0.00019166568099187304,
+      "loss": 0.6549,
+      "step": 589
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.38803183828469207,
+      "learning_rate": 0.0001916311233791783,
+      "loss": 0.6723,
+      "step": 590
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.3624907180591766,
+      "learning_rate": 0.00019159649739797162,
+      "loss": 0.6929,
+      "step": 591
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.3832990684783281,
+      "learning_rate": 0.00019156180307408846,
+      "loss": 0.7006,
+      "step": 592
+    },
+    {
+      "epoch": 0.15813333333333332,
+      "grad_norm": 0.3849305246144806,
+      "learning_rate": 0.0001915270404334152,
+      "loss": 0.6389,
+      "step": 593
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.3846659635172018,
+      "learning_rate": 0.00019149220950188917,
+      "loss": 0.7154,
+      "step": 594
+    },
+    {
+      "epoch": 0.15866666666666668,
+      "grad_norm": 0.39216567763845006,
+      "learning_rate": 0.0001914573103054987,
+      "loss": 0.7012,
+      "step": 595
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.38246961098197996,
+      "learning_rate": 0.00019142234287028312,
+      "loss": 0.6521,
+      "step": 596
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.3704981314071925,
+      "learning_rate": 0.00019138730722233248,
+      "loss": 0.6531,
+      "step": 597
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.3899946091619324,
+      "learning_rate": 0.00019135220338778797,
+      "loss": 0.693,
+      "step": 598
+    },
+    {
+      "epoch": 0.15973333333333334,
+      "grad_norm": 0.3672076618385661,
+      "learning_rate": 0.0001913170313928414,
+      "loss": 0.6759,
+      "step": 599
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.40260111828210965,
+      "learning_rate": 0.00019128179126373567,
+      "loss": 0.7136,
+      "step": 600
+    },
+    {
+      "epoch": 0.16026666666666667,
+      "grad_norm": 0.3959889909912464,
+      "learning_rate": 0.00019124648302676434,
+      "loss": 0.7347,
+      "step": 601
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.37393190727411146,
+      "learning_rate": 0.00019121110670827193,
+      "loss": 0.6708,
+      "step": 602
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.37990440438717593,
+      "learning_rate": 0.00019117566233465362,
+      "loss": 0.6772,
+      "step": 603
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.3990101216733403,
+      "learning_rate": 0.00019114014993235553,
+      "loss": 0.7077,
+      "step": 604
+    },
+    {
+      "epoch": 0.16133333333333333,
+      "grad_norm": 0.36806391265789873,
+      "learning_rate": 0.00019110456952787432,
+      "loss": 0.6617,
+      "step": 605
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.3680440018563989,
+      "learning_rate": 0.00019106892114775762,
+      "loss": 0.6736,
+      "step": 606
+    },
+    {
+      "epoch": 0.16186666666666666,
+      "grad_norm": 0.3770635583222423,
+      "learning_rate": 0.0001910332048186036,
+      "loss": 0.6786,
+      "step": 607
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.36781987369544633,
+      "learning_rate": 0.00019099742056706123,
+      "loss": 0.6597,
+      "step": 608
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.4188436166773392,
+      "learning_rate": 0.00019096156841983013,
+      "loss": 0.6706,
+      "step": 609
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.3974136941701652,
+      "learning_rate": 0.00019092564840366056,
+      "loss": 0.7326,
+      "step": 610
+    },
+    {
+      "epoch": 0.16293333333333335,
+      "grad_norm": 0.3688977129953491,
+      "learning_rate": 0.0001908896605453535,
+      "loss": 0.6486,
+      "step": 611
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.36998873237097046,
+      "learning_rate": 0.00019085360487176037,
+      "loss": 0.6528,
+      "step": 612
+    },
+    {
+      "epoch": 0.16346666666666668,
+      "grad_norm": 0.3725068864338746,
+      "learning_rate": 0.0001908174814097834,
+      "loss": 0.6985,
+      "step": 613
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.37511559165531033,
+      "learning_rate": 0.00019078129018637528,
+      "loss": 0.6815,
+      "step": 614
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.3761365006708317,
+      "learning_rate": 0.00019074503122853924,
+      "loss": 0.6695,
+      "step": 615
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.37067365233924277,
+      "learning_rate": 0.00019070870456332914,
+      "loss": 0.6804,
+      "step": 616
+    },
+    {
+      "epoch": 0.16453333333333334,
+      "grad_norm": 0.33584045515665806,
+      "learning_rate": 0.00019067231021784929,
+      "loss": 0.616,
+      "step": 617
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.37880782554632325,
+      "learning_rate": 0.0001906358482192545,
+      "loss": 0.6822,
+      "step": 618
+    },
+    {
+      "epoch": 0.16506666666666667,
+      "grad_norm": 0.36710286954036647,
+      "learning_rate": 0.00019059931859475012,
+      "loss": 0.6798,
+      "step": 619
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.38456640408824266,
+      "learning_rate": 0.00019056272137159188,
+      "loss": 0.6911,
+      "step": 620
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.43076307050161555,
+      "learning_rate": 0.00019052605657708595,
+      "loss": 0.7397,
+      "step": 621
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.3843910559308764,
+      "learning_rate": 0.000190489324238589,
+      "loss": 0.765,
+      "step": 622
+    },
+    {
+      "epoch": 0.16613333333333333,
+      "grad_norm": 0.35023931610344894,
+      "learning_rate": 0.00019045252438350802,
+      "loss": 0.6891,
+      "step": 623
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3684146530025913,
+      "learning_rate": 0.0001904156570393004,
+      "loss": 0.6723,
+      "step": 624
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 0.37507060892962285,
+      "learning_rate": 0.00019037872223347387,
+      "loss": 0.7164,
+      "step": 625
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.3862607727860442,
+      "learning_rate": 0.00019034171999358655,
+      "loss": 0.7107,
+      "step": 626
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.39292680630064536,
+      "learning_rate": 0.00019030465034724676,
+      "loss": 0.7351,
+      "step": 627
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.38247952716548084,
+      "learning_rate": 0.00019026751332211324,
+      "loss": 0.693,
+      "step": 628
+    },
+    {
+      "epoch": 0.16773333333333335,
+      "grad_norm": 0.39190391476563474,
+      "learning_rate": 0.00019023030894589496,
+      "loss": 0.701,
+      "step": 629
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3685822952964325,
+      "learning_rate": 0.0001901930372463511,
+      "loss": 0.697,
+      "step": 630
+    },
+    {
+      "epoch": 0.16826666666666668,
+      "grad_norm": 0.390305551735311,
+      "learning_rate": 0.0001901556982512911,
+      "loss": 0.6797,
+      "step": 631
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.40394755438945706,
+      "learning_rate": 0.00019011829198857467,
+      "loss": 0.6542,
+      "step": 632
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.3641097467415087,
+      "learning_rate": 0.0001900808184861116,
+      "loss": 0.6735,
+      "step": 633
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.38171692349208125,
+      "learning_rate": 0.00019004327777186192,
+      "loss": 0.7438,
+      "step": 634
+    },
+    {
+      "epoch": 0.16933333333333334,
+      "grad_norm": 0.364935435068997,
+      "learning_rate": 0.00019000566987383583,
+      "loss": 0.7249,
+      "step": 635
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3745818024901291,
+      "learning_rate": 0.00018996799482009352,
+      "loss": 0.703,
+      "step": 636
+    },
+    {
+      "epoch": 0.16986666666666667,
+      "grad_norm": 0.3816239881383332,
+      "learning_rate": 0.0001899302526387455,
+      "loss": 0.6274,
+      "step": 637
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.37760949844648883,
+      "learning_rate": 0.00018989244335795223,
+      "loss": 0.6202,
+      "step": 638
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.3991502273105771,
+      "learning_rate": 0.0001898545670059242,
+      "loss": 0.7343,
+      "step": 639
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.4644524349583215,
+      "learning_rate": 0.00018981662361092206,
+      "loss": 0.7537,
+      "step": 640
+    },
+    {
+      "epoch": 0.17093333333333333,
+      "grad_norm": 0.3743258046392929,
+      "learning_rate": 0.0001897786132012564,
+      "loss": 0.7124,
+      "step": 641
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.37129701740053306,
+      "learning_rate": 0.00018974053580528784,
+      "loss": 0.6693,
+      "step": 642
+    },
+    {
+      "epoch": 0.17146666666666666,
+      "grad_norm": 0.3976736544219594,
+      "learning_rate": 0.000189702391451427,
+      "loss": 0.7239,
+      "step": 643
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.40436774671596665,
+      "learning_rate": 0.00018966418016813443,
+      "loss": 0.758,
+      "step": 644
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.39558895992871945,
+      "learning_rate": 0.00018962590198392057,
+      "loss": 0.7048,
+      "step": 645
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.4039053861116352,
+      "learning_rate": 0.0001895875569273459,
+      "loss": 0.6667,
+      "step": 646
+    },
+    {
+      "epoch": 0.17253333333333334,
+      "grad_norm": 0.3832393253577056,
+      "learning_rate": 0.00018954914502702068,
+      "loss": 0.7354,
+      "step": 647
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.39840124059988224,
+      "learning_rate": 0.00018951066631160511,
+      "loss": 0.7019,
+      "step": 648
+    },
+    {
+      "epoch": 0.17306666666666667,
+      "grad_norm": 0.3821003277970343,
+      "learning_rate": 0.0001894721208098092,
+      "loss": 0.655,
+      "step": 649
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.4031718649940101,
+      "learning_rate": 0.00018943350855039285,
+      "loss": 0.6894,
+      "step": 650
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.38452117467402436,
+      "learning_rate": 0.00018939482956216572,
+      "loss": 0.7054,
+      "step": 651
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.37693538774583135,
+      "learning_rate": 0.00018935608387398727,
+      "loss": 0.6349,
+      "step": 652
+    },
+    {
+      "epoch": 0.17413333333333333,
+      "grad_norm": 0.3697648619397049,
+      "learning_rate": 0.00018931727151476671,
+      "loss": 0.6857,
+      "step": 653
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.40125499460001507,
+      "learning_rate": 0.00018927839251346303,
+      "loss": 0.7577,
+      "step": 654
+    },
+    {
+      "epoch": 0.17466666666666666,
+      "grad_norm": 0.35724768359663395,
+      "learning_rate": 0.00018923944689908494,
+      "loss": 0.6452,
+      "step": 655
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.3723289881457659,
+      "learning_rate": 0.00018920043470069077,
+      "loss": 0.6977,
+      "step": 656
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.37662720192702304,
+      "learning_rate": 0.0001891613559473887,
+      "loss": 0.6155,
+      "step": 657
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.39095978551764027,
+      "learning_rate": 0.0001891222106683364,
+      "loss": 0.6831,
+      "step": 658
+    },
+    {
+      "epoch": 0.17573333333333332,
+      "grad_norm": 0.3953272394697602,
+      "learning_rate": 0.00018908299889274128,
+      "loss": 0.7185,
+      "step": 659
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.38921264053878885,
+      "learning_rate": 0.0001890437206498603,
+      "loss": 0.6773,
+      "step": 660
+    },
+    {
+      "epoch": 0.17626666666666665,
+      "grad_norm": 0.4363970184028858,
+      "learning_rate": 0.00018900437596900007,
+      "loss": 0.7083,
+      "step": 661
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.35566662636829927,
+      "learning_rate": 0.0001889649648795167,
+      "loss": 0.6541,
+      "step": 662
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.35499374433455266,
+      "learning_rate": 0.00018892548741081592,
+      "loss": 0.6732,
+      "step": 663
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.3798882778924792,
+      "learning_rate": 0.00018888594359235295,
+      "loss": 0.7193,
+      "step": 664
+    },
+    {
+      "epoch": 0.17733333333333334,
+      "grad_norm": 0.37463819001129894,
+      "learning_rate": 0.00018884633345363257,
+      "loss": 0.7098,
+      "step": 665
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.3707520243831594,
+      "learning_rate": 0.00018880665702420893,
+      "loss": 0.6652,
+      "step": 666
+    },
+    {
+      "epoch": 0.17786666666666667,
+      "grad_norm": 0.38359257838556565,
+      "learning_rate": 0.00018876691433368577,
+      "loss": 0.7044,
+      "step": 667
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.3498781616388173,
+      "learning_rate": 0.00018872710541171614,
+      "loss": 0.6999,
+      "step": 668
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.3493419838565732,
+      "learning_rate": 0.00018868723028800263,
+      "loss": 0.6425,
+      "step": 669
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.36423931168153995,
+      "learning_rate": 0.00018864728899229717,
+      "loss": 0.6733,
+      "step": 670
+    },
+    {
+      "epoch": 0.17893333333333333,
+      "grad_norm": 0.36980669313173353,
+      "learning_rate": 0.00018860728155440106,
+      "loss": 0.6896,
+      "step": 671
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3565752863470386,
+      "learning_rate": 0.00018856720800416494,
+      "loss": 0.672,
+      "step": 672
+    },
+    {
+      "epoch": 0.17946666666666666,
+      "grad_norm": 0.35198793277813784,
+      "learning_rate": 0.0001885270683714888,
+      "loss": 0.6548,
+      "step": 673
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.37550168014656987,
+      "learning_rate": 0.00018848686268632193,
+      "loss": 0.6914,
+      "step": 674
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.37318313956676913,
+      "learning_rate": 0.0001884465909786629,
+      "loss": 0.7396,
+      "step": 675
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.37129209587571155,
+      "learning_rate": 0.0001884062532785595,
+      "loss": 0.7052,
+      "step": 676
+    },
+    {
+      "epoch": 0.18053333333333332,
+      "grad_norm": 0.3756634716439558,
+      "learning_rate": 0.00018836584961610887,
+      "loss": 0.6628,
+      "step": 677
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3685808183977085,
+      "learning_rate": 0.00018832538002145727,
+      "loss": 0.6351,
+      "step": 678
+    },
+    {
+      "epoch": 0.18106666666666665,
+      "grad_norm": 0.3585546951446435,
+      "learning_rate": 0.00018828484452480023,
+      "loss": 0.7001,
+      "step": 679
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.3635651527327575,
+      "learning_rate": 0.00018824424315638233,
+      "loss": 0.6577,
+      "step": 680
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.4062132118445034,
+      "learning_rate": 0.00018820357594649738,
+      "loss": 0.6699,
+      "step": 681
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.39259652521345406,
+      "learning_rate": 0.00018816284292548833,
+      "loss": 0.688,
+      "step": 682
+    },
+    {
+      "epoch": 0.18213333333333334,
+      "grad_norm": 0.4034976908204871,
+      "learning_rate": 0.00018812204412374723,
+      "loss": 0.6796,
+      "step": 683
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.35753477880743434,
+      "learning_rate": 0.00018808117957171518,
+      "loss": 0.6556,
+      "step": 684
+    },
+    {
+      "epoch": 0.18266666666666667,
+      "grad_norm": 0.3773797963092146,
+      "learning_rate": 0.00018804024929988233,
+      "loss": 0.7251,
+      "step": 685
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.3937096142107768,
+      "learning_rate": 0.0001879992533387879,
+      "loss": 0.7266,
+      "step": 686
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.3559842332949686,
+      "learning_rate": 0.00018795819171902014,
+      "loss": 0.7054,
+      "step": 687
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.38416397544317316,
+      "learning_rate": 0.00018791706447121622,
+      "loss": 0.6845,
+      "step": 688
+    },
+    {
+      "epoch": 0.18373333333333333,
+      "grad_norm": 0.37566552918129953,
+      "learning_rate": 0.00018787587162606231,
+      "loss": 0.723,
+      "step": 689
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.3854055545132654,
+      "learning_rate": 0.00018783461321429353,
+      "loss": 0.7013,
+      "step": 690
+    },
+    {
+      "epoch": 0.18426666666666666,
+      "grad_norm": 0.38311731286896406,
+      "learning_rate": 0.00018779328926669397,
+      "loss": 0.7069,
+      "step": 691
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.3829186839362291,
+      "learning_rate": 0.00018775189981409652,
+      "loss": 0.6708,
+      "step": 692
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.38912545616839017,
+      "learning_rate": 0.00018771044488738299,
+      "loss": 0.694,
+      "step": 693
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.3933253628815094,
+      "learning_rate": 0.00018766892451748407,
+      "loss": 0.7311,
+      "step": 694
+    },
+    {
+      "epoch": 0.18533333333333332,
+      "grad_norm": 0.3765880474272644,
+      "learning_rate": 0.0001876273387353793,
+      "loss": 0.6928,
+      "step": 695
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3772820075421581,
+      "learning_rate": 0.00018758568757209685,
+      "loss": 0.6824,
+      "step": 696
+    },
+    {
+      "epoch": 0.18586666666666668,
+      "grad_norm": 0.3729809873038056,
+      "learning_rate": 0.0001875439710587139,
+      "loss": 0.6719,
+      "step": 697
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.37035412471318624,
+      "learning_rate": 0.00018750218922635633,
+      "loss": 0.6914,
+      "step": 698
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.3801581932418795,
+      "learning_rate": 0.0001874603421061986,
+      "loss": 0.6536,
+      "step": 699
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.3885577142479478,
+      "learning_rate": 0.0001874184297294641,
+      "loss": 0.6715,
+      "step": 700
+    },
+    {
+      "epoch": 0.18693333333333334,
+      "grad_norm": 0.38220258521124545,
+      "learning_rate": 0.00018737645212742475,
+      "loss": 0.7083,
+      "step": 701
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.3618802957433504,
+      "learning_rate": 0.00018733440933140126,
+      "loss": 0.7219,
+      "step": 702
+    },
+    {
+      "epoch": 0.18746666666666667,
+      "grad_norm": 0.3625799662149451,
+      "learning_rate": 0.00018729230137276285,
+      "loss": 0.6565,
+      "step": 703
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.38285072803778125,
+      "learning_rate": 0.0001872501282829275,
+      "loss": 0.668,
+      "step": 704
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.36021955278262335,
+      "learning_rate": 0.00018720789009336167,
+      "loss": 0.6385,
+      "step": 705
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.3768393533352249,
+      "learning_rate": 0.00018716558683558044,
+      "loss": 0.705,
+      "step": 706
+    },
+    {
+      "epoch": 0.18853333333333333,
+      "grad_norm": 0.37254652608248173,
+      "learning_rate": 0.00018712321854114748,
+      "loss": 0.6633,
+      "step": 707
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.36355687961053096,
+      "learning_rate": 0.00018708078524167488,
+      "loss": 0.6509,
+      "step": 708
+    },
+    {
+      "epoch": 0.18906666666666666,
+      "grad_norm": 0.3706284035093737,
+      "learning_rate": 0.00018703828696882337,
+      "loss": 0.6912,
+      "step": 709
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.40500471249350956,
+      "learning_rate": 0.00018699572375430206,
+      "loss": 0.6926,
+      "step": 710
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.3908052292451713,
+      "learning_rate": 0.0001869530956298685,
+      "loss": 0.719,
+      "step": 711
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.3741685035500202,
+      "learning_rate": 0.0001869104026273288,
+      "loss": 0.6612,
+      "step": 712
+    },
+    {
+      "epoch": 0.19013333333333332,
+      "grad_norm": 0.371074216203983,
+      "learning_rate": 0.00018686764477853724,
+      "loss": 0.7273,
+      "step": 713
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.36509941624145703,
+      "learning_rate": 0.00018682482211539677,
+      "loss": 0.6396,
+      "step": 714
+    },
+    {
+      "epoch": 0.19066666666666668,
+      "grad_norm": 0.3497839006663842,
+      "learning_rate": 0.0001867819346698585,
+      "loss": 0.6242,
+      "step": 715
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.3537148420688285,
+      "learning_rate": 0.00018673898247392197,
+      "loss": 0.6216,
+      "step": 716
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.38854314638329096,
+      "learning_rate": 0.00018669596555963497,
+      "loss": 0.6639,
+      "step": 717
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.38526262005188155,
+      "learning_rate": 0.00018665288395909363,
+      "loss": 0.6941,
+      "step": 718
+    },
+    {
+      "epoch": 0.19173333333333334,
+      "grad_norm": 0.3909003225026855,
+      "learning_rate": 0.00018660973770444228,
+      "loss": 0.7138,
+      "step": 719
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.35323993123136765,
+      "learning_rate": 0.00018656652682787358,
+      "loss": 0.6824,
+      "step": 720
+    },
+    {
+      "epoch": 0.19226666666666667,
+      "grad_norm": 0.37616707591461246,
+      "learning_rate": 0.00018652325136162833,
+      "loss": 0.6643,
+      "step": 721
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.3771200028723444,
+      "learning_rate": 0.00018647991133799558,
+      "loss": 0.7281,
+      "step": 722
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.39027530921813985,
+      "learning_rate": 0.00018643650678931248,
+      "loss": 0.6906,
+      "step": 723
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.37809192544625997,
+      "learning_rate": 0.0001863930377479644,
+      "loss": 0.7354,
+      "step": 724
+    },
+    {
+      "epoch": 0.19333333333333333,
+      "grad_norm": 0.3799375259063448,
+      "learning_rate": 0.0001863495042463848,
+      "loss": 0.7017,
+      "step": 725
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.3779825934739253,
+      "learning_rate": 0.00018630590631705512,
+      "loss": 0.6667,
+      "step": 726
+    },
+    {
+      "epoch": 0.19386666666666666,
+      "grad_norm": 0.3846767603058412,
+      "learning_rate": 0.00018626224399250513,
+      "loss": 0.7023,
+      "step": 727
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.38845373616066514,
+      "learning_rate": 0.00018621851730531242,
+      "loss": 0.672,
+      "step": 728
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.3752092260227272,
+      "learning_rate": 0.00018617472628810268,
+      "loss": 0.712,
+      "step": 729
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.3743270364949635,
+      "learning_rate": 0.00018613087097354958,
+      "loss": 0.6662,
+      "step": 730
+    },
+    {
+      "epoch": 0.19493333333333332,
+      "grad_norm": 0.358376110306125,
+      "learning_rate": 0.00018608695139437487,
+      "loss": 0.6291,
+      "step": 731
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.36722400631538576,
+      "learning_rate": 0.00018604296758334803,
+      "loss": 0.6954,
+      "step": 732
+    },
+    {
+      "epoch": 0.19546666666666668,
+      "grad_norm": 0.3794407568863177,
+      "learning_rate": 0.0001859989195732867,
+      "loss": 0.7316,
+      "step": 733
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.3738374961196488,
+      "learning_rate": 0.00018595480739705628,
+      "loss": 0.6928,
+      "step": 734
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.3742656591480173,
+      "learning_rate": 0.00018591063108757007,
+      "loss": 0.7018,
+      "step": 735
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.41126781845681987,
+      "learning_rate": 0.00018586639067778924,
+      "loss": 0.6897,
+      "step": 736
+    },
+    {
+      "epoch": 0.19653333333333334,
+      "grad_norm": 0.3742048996635612,
+      "learning_rate": 0.0001858220862007228,
+      "loss": 0.6677,
+      "step": 737
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.39672258470120586,
+      "learning_rate": 0.00018577771768942753,
+      "loss": 0.7297,
+      "step": 738
+    },
+    {
+      "epoch": 0.19706666666666667,
+      "grad_norm": 0.35983278629271476,
+      "learning_rate": 0.00018573328517700803,
+      "loss": 0.6868,
+      "step": 739
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.360530906892004,
+      "learning_rate": 0.00018568878869661658,
+      "loss": 0.7297,
+      "step": 740
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.37679000714248617,
+      "learning_rate": 0.00018564422828145326,
+      "loss": 0.6783,
+      "step": 741
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.38135885735408154,
+      "learning_rate": 0.00018559960396476578,
+      "loss": 0.6755,
+      "step": 742
+    },
+    {
+      "epoch": 0.19813333333333333,
+      "grad_norm": 0.3676255502336289,
+      "learning_rate": 0.00018555491577984968,
+      "loss": 0.6701,
+      "step": 743
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.37092160190801005,
+      "learning_rate": 0.00018551016376004795,
+      "loss": 0.652,
+      "step": 744
+    },
+    {
+      "epoch": 0.19866666666666666,
+      "grad_norm": 0.37617860948393356,
+      "learning_rate": 0.00018546534793875132,
+      "loss": 0.6609,
+      "step": 745
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.39949026181909875,
+      "learning_rate": 0.00018542046834939816,
+      "loss": 0.6735,
+      "step": 746
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.35456959936277177,
+      "learning_rate": 0.00018537552502547432,
+      "loss": 0.6795,
+      "step": 747
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.3547637684272837,
+      "learning_rate": 0.00018533051800051332,
+      "loss": 0.6404,
+      "step": 748
+    },
+    {
+      "epoch": 0.19973333333333335,
+      "grad_norm": 0.39532200273811835,
+      "learning_rate": 0.0001852854473080961,
+      "loss": 0.7151,
+      "step": 749
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3594284858890217,
+      "learning_rate": 0.0001852403129818511,
+      "loss": 0.6601,
+      "step": 750
+    },
+    {
+      "epoch": 0.20026666666666668,
+      "grad_norm": 0.3802212011617842,
+      "learning_rate": 0.0001851951150554544,
+      "loss": 0.7347,
+      "step": 751
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.3616455410835533,
+      "learning_rate": 0.00018514985356262934,
+      "loss": 0.6856,
+      "step": 752
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.37844405621281424,
+      "learning_rate": 0.00018510452853714678,
+      "loss": 0.6433,
+      "step": 753
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.3650231986573298,
+      "learning_rate": 0.000185059140012825,
+      "loss": 0.6539,
+      "step": 754
+    },
+    {
+      "epoch": 0.20133333333333334,
+      "grad_norm": 0.3671252126818593,
+      "learning_rate": 0.00018501368802352957,
+      "loss": 0.7281,
+      "step": 755
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3563889043200433,
+      "learning_rate": 0.0001849681726031736,
+      "loss": 0.6583,
+      "step": 756
+    },
+    {
+      "epoch": 0.20186666666666667,
+      "grad_norm": 0.35200170683064536,
+      "learning_rate": 0.00018492259378571725,
+      "loss": 0.6707,
+      "step": 757
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.3636122431240546,
+      "learning_rate": 0.00018487695160516825,
+      "loss": 0.711,
+      "step": 758
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.36439355090291375,
+      "learning_rate": 0.00018483124609558143,
+      "loss": 0.6849,
+      "step": 759
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.3735234618741416,
+      "learning_rate": 0.00018478547729105897,
+      "loss": 0.6558,
+      "step": 760
+    },
+    {
+      "epoch": 0.20293333333333333,
+      "grad_norm": 0.3621322005276756,
+      "learning_rate": 0.0001847396452257502,
+      "loss": 0.679,
+      "step": 761
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3663733156867625,
+      "learning_rate": 0.00018469374993385174,
+      "loss": 0.6962,
+      "step": 762
+    },
+    {
+      "epoch": 0.20346666666666666,
+      "grad_norm": 0.38410871208592184,
+      "learning_rate": 0.00018464779144960726,
+      "loss": 0.7099,
+      "step": 763
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.3791658521215643,
+      "learning_rate": 0.00018460176980730775,
+      "loss": 0.7021,
+      "step": 764
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.35880046589803916,
+      "learning_rate": 0.00018455568504129115,
+      "loss": 0.6467,
+      "step": 765
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.3505270740068604,
+      "learning_rate": 0.0001845095371859426,
+      "loss": 0.6661,
+      "step": 766
+    },
+    {
+      "epoch": 0.20453333333333334,
+      "grad_norm": 0.3620142085417268,
+      "learning_rate": 0.0001844633262756943,
+      "loss": 0.7191,
+      "step": 767
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3864279961960631,
+      "learning_rate": 0.00018441705234502548,
+      "loss": 0.6899,
+      "step": 768
+    },
+    {
+      "epoch": 0.20506666666666667,
+      "grad_norm": 0.37482766104381093,
+      "learning_rate": 0.0001843707154284624,
+      "loss": 0.6976,
+      "step": 769
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.38529960394893153,
+      "learning_rate": 0.00018432431556057832,
+      "loss": 0.7183,
+      "step": 770
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.36391481747905546,
+      "learning_rate": 0.00018427785277599345,
+      "loss": 0.7219,
+      "step": 771
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.380908481889315,
+      "learning_rate": 0.00018423132710937497,
+      "loss": 0.7165,
+      "step": 772
+    },
+    {
+      "epoch": 0.20613333333333334,
+      "grad_norm": 0.3779389305458948,
+      "learning_rate": 0.00018418473859543695,
+      "loss": 0.7051,
+      "step": 773
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.36318923582328305,
+      "learning_rate": 0.00018413808726894037,
+      "loss": 0.6515,
+      "step": 774
+    },
+    {
+      "epoch": 0.20666666666666667,
+      "grad_norm": 0.365308687784887,
+      "learning_rate": 0.00018409137316469307,
+      "loss": 0.6767,
+      "step": 775
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.3626020322486663,
+      "learning_rate": 0.00018404459631754974,
+      "loss": 0.6629,
+      "step": 776
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.3491393349155495,
+      "learning_rate": 0.0001839977567624119,
+      "loss": 0.632,
+      "step": 777
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.3790090586840057,
+      "learning_rate": 0.0001839508545342278,
+      "loss": 0.6615,
+      "step": 778
+    },
+    {
+      "epoch": 0.20773333333333333,
+      "grad_norm": 0.37875087514028966,
+      "learning_rate": 0.0001839038896679925,
+      "loss": 0.6749,
+      "step": 779
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.37912120811174993,
+      "learning_rate": 0.0001838568621987478,
+      "loss": 0.6676,
+      "step": 780
+    },
+    {
+      "epoch": 0.20826666666666666,
+      "grad_norm": 0.3783780332693869,
+      "learning_rate": 0.00018380977216158215,
+      "loss": 0.6592,
+      "step": 781
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.35583651903348806,
+      "learning_rate": 0.00018376261959163076,
+      "loss": 0.6626,
+      "step": 782
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.36447921870025685,
+      "learning_rate": 0.00018371540452407546,
+      "loss": 0.7059,
+      "step": 783
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.37712528689170866,
+      "learning_rate": 0.00018366812699414475,
+      "loss": 0.6725,
+      "step": 784
+    },
+    {
+      "epoch": 0.20933333333333334,
+      "grad_norm": 0.3613627845284323,
+      "learning_rate": 0.00018362078703711366,
+      "loss": 0.6447,
+      "step": 785
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.36877410930130333,
+      "learning_rate": 0.0001835733846883038,
+      "loss": 0.6449,
+      "step": 786
+    },
+    {
+      "epoch": 0.20986666666666667,
+      "grad_norm": 0.3569084003684999,
+      "learning_rate": 0.00018352591998308345,
+      "loss": 0.6077,
+      "step": 787
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.37035467939568206,
+      "learning_rate": 0.00018347839295686732,
+      "loss": 0.6931,
+      "step": 788
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.4034614433225208,
+      "learning_rate": 0.0001834308036451166,
+      "loss": 0.7177,
+      "step": 789
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.37777579908281395,
+      "learning_rate": 0.00018338315208333902,
+      "loss": 0.669,
+      "step": 790
+    },
+    {
+      "epoch": 0.21093333333333333,
+      "grad_norm": 0.37091469516607756,
+      "learning_rate": 0.00018333543830708872,
+      "loss": 0.6635,
+      "step": 791
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3619506344302214,
+      "learning_rate": 0.0001832876623519663,
+      "loss": 0.6571,
+      "step": 792
+    },
+    {
+      "epoch": 0.21146666666666666,
+      "grad_norm": 0.3913724968303028,
+      "learning_rate": 0.00018323982425361862,
+      "loss": 0.6397,
+      "step": 793
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.35941798327326463,
+      "learning_rate": 0.0001831919240477391,
+      "loss": 0.6512,
+      "step": 794
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.364866497442862,
+      "learning_rate": 0.00018314396177006737,
+      "loss": 0.683,
+      "step": 795
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.37662644812584084,
+      "learning_rate": 0.00018309593745638943,
+      "loss": 0.6618,
+      "step": 796
+    },
+    {
+      "epoch": 0.21253333333333332,
+      "grad_norm": 0.41794098986117456,
+      "learning_rate": 0.00018304785114253756,
+      "loss": 0.7205,
+      "step": 797
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3565020649378219,
+      "learning_rate": 0.0001829997028643902,
+      "loss": 0.6568,
+      "step": 798
+    },
+    {
+      "epoch": 0.21306666666666665,
+      "grad_norm": 0.38708444261364383,
+      "learning_rate": 0.00018295149265787222,
+      "loss": 0.7193,
+      "step": 799
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.41124485895028245,
+      "learning_rate": 0.00018290322055895453,
+      "loss": 0.706,
+      "step": 800
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.37390625327736315,
+      "learning_rate": 0.0001828548866036543,
+      "loss": 0.7001,
+      "step": 801
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.43156584902192524,
+      "learning_rate": 0.00018280649082803478,
+      "loss": 0.6869,
+      "step": 802
+    },
+    {
+      "epoch": 0.21413333333333334,
+      "grad_norm": 0.37331145354869777,
+      "learning_rate": 0.00018275803326820545,
+      "loss": 0.6633,
+      "step": 803
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.36755237856747536,
+      "learning_rate": 0.00018270951396032179,
+      "loss": 0.7256,
+      "step": 804
+    },
+    {
+      "epoch": 0.21466666666666667,
+      "grad_norm": 0.3761698123559096,
+      "learning_rate": 0.00018266093294058542,
+      "loss": 0.7128,
+      "step": 805
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.37577680738610936,
+      "learning_rate": 0.000182612290245244,
+      "loss": 0.6561,
+      "step": 806
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.3546255870164141,
+      "learning_rate": 0.00018256358591059116,
+      "loss": 0.68,
+      "step": 807
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4004601400429358,
+      "learning_rate": 0.00018251481997296653,
+      "loss": 0.6453,
+      "step": 808
+    },
+    {
+      "epoch": 0.21573333333333333,
+      "grad_norm": 0.3864264432170919,
+      "learning_rate": 0.0001824659924687558,
+      "loss": 0.6829,
+      "step": 809
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3731103874980662,
+      "learning_rate": 0.00018241710343439043,
+      "loss": 0.7011,
+      "step": 810
+    },
+    {
+      "epoch": 0.21626666666666666,
+      "grad_norm": 0.39402952342028597,
+      "learning_rate": 0.00018236815290634796,
+      "loss": 0.73,
+      "step": 811
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.38064391724654933,
+      "learning_rate": 0.00018231914092115163,
+      "loss": 0.6975,
+      "step": 812
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.3454030876230318,
+      "learning_rate": 0.0001822700675153707,
+      "loss": 0.6306,
+      "step": 813
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.3757030862071576,
+      "learning_rate": 0.0001822209327256202,
+      "loss": 0.6193,
+      "step": 814
+    },
+    {
+      "epoch": 0.21733333333333332,
+      "grad_norm": 0.34305496517409284,
+      "learning_rate": 0.00018217173658856097,
+      "loss": 0.6308,
+      "step": 815
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.37111523510124256,
+      "learning_rate": 0.00018212247914089954,
+      "loss": 0.6539,
+      "step": 816
+    },
+    {
+      "epoch": 0.21786666666666665,
+      "grad_norm": 0.3787404183646755,
+      "learning_rate": 0.00018207316041938832,
+      "loss": 0.6763,
+      "step": 817
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.38946742059341627,
+      "learning_rate": 0.00018202378046082532,
+      "loss": 0.6883,
+      "step": 818
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.3771883394030353,
+      "learning_rate": 0.0001819743393020543,
+      "loss": 0.6729,
+      "step": 819
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.3669827885293365,
+      "learning_rate": 0.00018192483697996472,
+      "loss": 0.705,
+      "step": 820
+    },
+    {
+      "epoch": 0.21893333333333334,
+      "grad_norm": 0.38744728666382117,
+      "learning_rate": 0.00018187527353149158,
+      "loss": 0.676,
+      "step": 821
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.3702189972702748,
+      "learning_rate": 0.00018182564899361556,
+      "loss": 0.6725,
+      "step": 822
+    },
+    {
+      "epoch": 0.21946666666666667,
+      "grad_norm": 0.37497364351640805,
+      "learning_rate": 0.00018177596340336288,
+      "loss": 0.6539,
+      "step": 823
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.3937422659286144,
+      "learning_rate": 0.00018172621679780532,
+      "loss": 0.7364,
+      "step": 824
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.3575934371504687,
+      "learning_rate": 0.00018167640921406023,
+      "loss": 0.7327,
+      "step": 825
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.3672174923553296,
+      "learning_rate": 0.00018162654068929043,
+      "loss": 0.7038,
+      "step": 826
+    },
+    {
+      "epoch": 0.22053333333333333,
+      "grad_norm": 0.4161839683300949,
+      "learning_rate": 0.0001815766112607042,
+      "loss": 0.7,
+      "step": 827
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3706453113933822,
+      "learning_rate": 0.0001815266209655552,
+      "loss": 0.6571,
+      "step": 828
+    },
+    {
+      "epoch": 0.22106666666666666,
+      "grad_norm": 0.3670205544473324,
+      "learning_rate": 0.00018147656984114266,
+      "loss": 0.6754,
+      "step": 829
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.3789810792810146,
+      "learning_rate": 0.00018142645792481107,
+      "loss": 0.6946,
+      "step": 830
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.40461845415517067,
+      "learning_rate": 0.0001813762852539503,
+      "loss": 0.6543,
+      "step": 831
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.38411984430369106,
+      "learning_rate": 0.0001813260518659956,
+      "loss": 0.6681,
+      "step": 832
+    },
+    {
+      "epoch": 0.22213333333333332,
+      "grad_norm": 0.40885506891758255,
+      "learning_rate": 0.00018127575779842742,
+      "loss": 0.6045,
+      "step": 833
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.37170446490494097,
+      "learning_rate": 0.00018122540308877162,
+      "loss": 0.7112,
+      "step": 834
+    },
+    {
+      "epoch": 0.22266666666666668,
+      "grad_norm": 0.39703368312579296,
+      "learning_rate": 0.00018117498777459924,
+      "loss": 0.6759,
+      "step": 835
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.3767130091451189,
+      "learning_rate": 0.00018112451189352652,
+      "loss": 0.6686,
+      "step": 836
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.38993338683923956,
+      "learning_rate": 0.00018107397548321487,
+      "loss": 0.6136,
+      "step": 837
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.3855059989989872,
+      "learning_rate": 0.00018102337858137094,
+      "loss": 0.6788,
+      "step": 838
+    },
+    {
+      "epoch": 0.22373333333333334,
+      "grad_norm": 0.3803130749460877,
+      "learning_rate": 0.0001809727212257465,
+      "loss": 0.6718,
+      "step": 839
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4033073629746989,
+      "learning_rate": 0.00018092200345413837,
+      "loss": 0.6091,
+      "step": 840
+    },
+    {
+      "epoch": 0.22426666666666667,
+      "grad_norm": 0.3797057141579435,
+      "learning_rate": 0.00018087122530438846,
+      "loss": 0.6326,
+      "step": 841
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.3879992997600297,
+      "learning_rate": 0.0001808203868143838,
+      "loss": 0.7559,
+      "step": 842
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.3928624627878775,
+      "learning_rate": 0.00018076948802205636,
+      "loss": 0.7095,
+      "step": 843
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.3938380577687365,
+      "learning_rate": 0.00018071852896538315,
+      "loss": 0.6768,
+      "step": 844
+    },
+    {
+      "epoch": 0.22533333333333333,
+      "grad_norm": 0.35035889321287694,
+      "learning_rate": 0.0001806675096823861,
+      "loss": 0.6797,
+      "step": 845
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.36684177650978017,
+      "learning_rate": 0.00018061643021113216,
+      "loss": 0.7067,
+      "step": 846
+    },
+    {
+      "epoch": 0.22586666666666666,
+      "grad_norm": 0.3830949418296738,
+      "learning_rate": 0.0001805652905897331,
+      "loss": 0.7198,
+      "step": 847
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.34357360805127096,
+      "learning_rate": 0.00018051409085634556,
+      "loss": 0.6488,
+      "step": 848
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.3702133101019209,
+      "learning_rate": 0.00018046283104917118,
+      "loss": 0.6151,
+      "step": 849
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.3763682241149857,
+      "learning_rate": 0.0001804115112064562,
+      "loss": 0.657,
+      "step": 850
+    },
+    {
+      "epoch": 0.22693333333333332,
+      "grad_norm": 0.37035958266927865,
+      "learning_rate": 0.00018036013136649186,
+      "loss": 0.6699,
+      "step": 851
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3603625279827103,
+      "learning_rate": 0.00018030869156761403,
+      "loss": 0.6582,
+      "step": 852
+    },
+    {
+      "epoch": 0.22746666666666668,
+      "grad_norm": 0.3982720688740039,
+      "learning_rate": 0.0001802571918482034,
+      "loss": 0.7482,
+      "step": 853
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.3758094698939472,
+      "learning_rate": 0.00018020563224668533,
+      "loss": 0.6999,
+      "step": 854
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.38749675883304113,
+      "learning_rate": 0.00018015401280152983,
+      "loss": 0.7056,
+      "step": 855
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.3915657129303144,
+      "learning_rate": 0.00018010233355125163,
+      "loss": 0.6845,
+      "step": 856
+    },
+    {
+      "epoch": 0.22853333333333334,
+      "grad_norm": 0.37142343215143453,
+      "learning_rate": 0.00018005059453441002,
+      "loss": 0.6336,
+      "step": 857
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.3649017025082439,
+      "learning_rate": 0.00017999879578960889,
+      "loss": 0.6321,
+      "step": 858
+    },
+    {
+      "epoch": 0.22906666666666667,
+      "grad_norm": 0.41404793524551164,
+      "learning_rate": 0.00017994693735549677,
+      "loss": 0.6768,
+      "step": 859
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.37170947659867914,
+      "learning_rate": 0.00017989501927076663,
+      "loss": 0.6663,
+      "step": 860
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.3995413208923138,
+      "learning_rate": 0.00017984304157415602,
+      "loss": 0.6732,
+      "step": 861
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.37557958255479806,
+      "learning_rate": 0.0001797910043044469,
+      "loss": 0.6547,
+      "step": 862
+    },
+    {
+      "epoch": 0.23013333333333333,
+      "grad_norm": 0.3477869841404091,
+      "learning_rate": 0.00017973890750046573,
+      "loss": 0.6151,
+      "step": 863
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3775952170739762,
+      "learning_rate": 0.00017968675120108338,
+      "loss": 0.632,
+      "step": 864
+    },
+    {
+      "epoch": 0.23066666666666666,
+      "grad_norm": 0.36848209655282005,
+      "learning_rate": 0.0001796345354452151,
+      "loss": 0.677,
+      "step": 865
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.3737195218502748,
+      "learning_rate": 0.0001795822602718205,
+      "loss": 0.6683,
+      "step": 866
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.36308433478212737,
+      "learning_rate": 0.0001795299257199035,
+      "loss": 0.6572,
+      "step": 867
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.36289198132546524,
+      "learning_rate": 0.00017947753182851245,
+      "loss": 0.6729,
+      "step": 868
+    },
+    {
+      "epoch": 0.23173333333333335,
+      "grad_norm": 0.3745943466101185,
+      "learning_rate": 0.0001794250786367398,
+      "loss": 0.7091,
+      "step": 869
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3523879995591142,
+      "learning_rate": 0.00017937256618372232,
+      "loss": 0.6247,
+      "step": 870
+    },
+    {
+      "epoch": 0.23226666666666668,
+      "grad_norm": 0.3551454603054259,
+      "learning_rate": 0.00017931999450864104,
+      "loss": 0.6516,
+      "step": 871
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.345300826949314,
+      "learning_rate": 0.00017926736365072115,
+      "loss": 0.616,
+      "step": 872
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.3719750321170669,
+      "learning_rate": 0.00017921467364923193,
+      "loss": 0.7055,
+      "step": 873
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.3766349813970907,
+      "learning_rate": 0.00017916192454348688,
+      "loss": 0.6575,
+      "step": 874
+    },
+    {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 0.35536148277251234,
+      "learning_rate": 0.00017910911637284357,
+      "loss": 0.698,
+      "step": 875
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3645922686346899,
+      "learning_rate": 0.00017905624917670366,
+      "loss": 0.6505,
+      "step": 876
+    },
+    {
+      "epoch": 0.23386666666666667,
+      "grad_norm": 0.3733496800144498,
+      "learning_rate": 0.00017900332299451273,
+      "loss": 0.6472,
+      "step": 877
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.3855523186295078,
+      "learning_rate": 0.00017895033786576056,
+      "loss": 0.6751,
+      "step": 878
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.38341509256938605,
+      "learning_rate": 0.0001788972938299808,
+      "loss": 0.6431,
+      "step": 879
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.37360303338418127,
+      "learning_rate": 0.00017884419092675105,
+      "loss": 0.6292,
+      "step": 880
+    },
+    {
+      "epoch": 0.23493333333333333,
+      "grad_norm": 0.42119666302347647,
+      "learning_rate": 0.00017879102919569285,
+      "loss": 0.6444,
+      "step": 881
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.36755331891950127,
+      "learning_rate": 0.00017873780867647162,
+      "loss": 0.7022,
+      "step": 882
+    },
+    {
+      "epoch": 0.23546666666666666,
+      "grad_norm": 0.35717611916899655,
+      "learning_rate": 0.00017868452940879675,
+      "loss": 0.6728,
+      "step": 883
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.36170010813011433,
+      "learning_rate": 0.00017863119143242124,
+      "loss": 0.6126,
+      "step": 884
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.4115929580021471,
+      "learning_rate": 0.00017857779478714213,
+      "loss": 0.6457,
+      "step": 885
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.35007903341304025,
+      "learning_rate": 0.0001785243395128001,
+      "loss": 0.6956,
+      "step": 886
+    },
+    {
+      "epoch": 0.23653333333333335,
+      "grad_norm": 0.3700423171601644,
+      "learning_rate": 0.00017847082564927957,
+      "loss": 0.6552,
+      "step": 887
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3838509453166939,
+      "learning_rate": 0.00017841725323650877,
+      "loss": 0.6744,
+      "step": 888
+    },
+    {
+      "epoch": 0.23706666666666668,
+      "grad_norm": 0.38170306779665314,
+      "learning_rate": 0.00017836362231445953,
+      "loss": 0.6526,
+      "step": 889
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.38212518819422253,
+      "learning_rate": 0.00017830993292314737,
+      "loss": 0.6637,
+      "step": 890
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.39266284643771737,
+      "learning_rate": 0.00017825618510263142,
+      "loss": 0.6682,
+      "step": 891
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.37057321845640795,
+      "learning_rate": 0.00017820237889301437,
+      "loss": 0.6751,
+      "step": 892
+    },
+    {
+      "epoch": 0.23813333333333334,
+      "grad_norm": 0.3525895622782891,
+      "learning_rate": 0.00017814851433444262,
+      "loss": 0.635,
+      "step": 893
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.36731955901952146,
+      "learning_rate": 0.00017809459146710593,
+      "loss": 0.6296,
+      "step": 894
+    },
+    {
+      "epoch": 0.23866666666666667,
+      "grad_norm": 0.3776104002873812,
+      "learning_rate": 0.00017804061033123767,
+      "loss": 0.7258,
+      "step": 895
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.3681502417412675,
+      "learning_rate": 0.00017798657096711464,
+      "loss": 0.6405,
+      "step": 896
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.36315112764267293,
+      "learning_rate": 0.0001779324734150571,
+      "loss": 0.6668,
+      "step": 897
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.3329708112538578,
+      "learning_rate": 0.00017787831771542872,
+      "loss": 0.6053,
+      "step": 898
+    },
+    {
+      "epoch": 0.23973333333333333,
+      "grad_norm": 0.3597972038634347,
+      "learning_rate": 0.00017782410390863662,
+      "loss": 0.6876,
+      "step": 899
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.36310599619074185,
+      "learning_rate": 0.00017776983203513113,
+      "loss": 0.6229,
+      "step": 900
+    },
+    {
+      "epoch": 0.24026666666666666,
+      "grad_norm": 0.37462854428278636,
+      "learning_rate": 0.00017771550213540607,
+      "loss": 0.7268,
+      "step": 901
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.35205708604798935,
+      "learning_rate": 0.00017766111424999842,
+      "loss": 0.6494,
+      "step": 902
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.3598438745211486,
+      "learning_rate": 0.00017760666841948856,
+      "loss": 0.7102,
+      "step": 903
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.36477586393521,
+      "learning_rate": 0.00017755216468449996,
+      "loss": 0.6396,
+      "step": 904
+    },
+    {
+      "epoch": 0.24133333333333334,
+      "grad_norm": 0.36967086452141024,
+      "learning_rate": 0.0001774976030856994,
+      "loss": 0.6866,
+      "step": 905
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.34281380487072416,
+      "learning_rate": 0.00017744298366379672,
+      "loss": 0.6625,
+      "step": 906
+    },
+    {
+      "epoch": 0.24186666666666667,
+      "grad_norm": 0.3727813585799761,
+      "learning_rate": 0.0001773883064595451,
+      "loss": 0.6521,
+      "step": 907
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.3695465129269741,
+      "learning_rate": 0.0001773335715137406,
+      "loss": 0.6337,
+      "step": 908
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.3998986207982075,
+      "learning_rate": 0.00017727877886722257,
+      "loss": 0.7152,
+      "step": 909
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.37960333241164557,
+      "learning_rate": 0.00017722392856087327,
+      "loss": 0.6853,
+      "step": 910
+    },
+    {
+      "epoch": 0.24293333333333333,
+      "grad_norm": 0.35251879527239355,
+      "learning_rate": 0.00017716902063561797,
+      "loss": 0.6539,
+      "step": 911
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.34915552878152734,
+      "learning_rate": 0.00017711405513242513,
+      "loss": 0.6582,
+      "step": 912
+    },
+    {
+      "epoch": 0.24346666666666666,
+      "grad_norm": 0.3853434496640489,
+      "learning_rate": 0.0001770590320923059,
+      "loss": 0.6455,
+      "step": 913
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.38215275427772366,
+      "learning_rate": 0.00017700395155631455,
+      "loss": 0.7082,
+      "step": 914
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.3735208392908576,
+      "learning_rate": 0.00017694881356554817,
+      "loss": 0.6519,
+      "step": 915
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.39245973372040327,
+      "learning_rate": 0.00017689361816114677,
+      "loss": 0.6559,
+      "step": 916
+    },
+    {
+      "epoch": 0.24453333333333332,
+      "grad_norm": 0.42824539539470896,
+      "learning_rate": 0.00017683836538429315,
+      "loss": 0.6914,
+      "step": 917
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.38039884993754464,
+      "learning_rate": 0.0001767830552762129,
+      "loss": 0.6935,
+      "step": 918
+    },
+    {
+      "epoch": 0.24506666666666665,
+      "grad_norm": 0.4053472220079151,
+      "learning_rate": 0.00017672768787817443,
+      "loss": 0.716,
+      "step": 919
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.3625670965985844,
+      "learning_rate": 0.00017667226323148894,
+      "loss": 0.654,
+      "step": 920
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.3846759169631727,
+      "learning_rate": 0.0001766167813775102,
+      "loss": 0.7346,
+      "step": 921
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.387823479061407,
+      "learning_rate": 0.00017656124235763485,
+      "loss": 0.6725,
+      "step": 922
+    },
+    {
+      "epoch": 0.24613333333333334,
+      "grad_norm": 0.37103577961587353,
+      "learning_rate": 0.000176505646213302,
+      "loss": 0.6387,
+      "step": 923
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.364084287388942,
+      "learning_rate": 0.00017644999298599355,
+      "loss": 0.6325,
+      "step": 924
+    },
+    {
+      "epoch": 0.24666666666666667,
+      "grad_norm": 0.3719338408947227,
+      "learning_rate": 0.00017639428271723384,
+      "loss": 0.6206,
+      "step": 925
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.5079978985352054,
+      "learning_rate": 0.00017633851544858988,
+      "loss": 0.6986,
+      "step": 926
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.3831887933477012,
+      "learning_rate": 0.00017628269122167115,
+      "loss": 0.681,
+      "step": 927
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.3862617901432955,
+      "learning_rate": 0.00017622681007812963,
+      "loss": 0.6786,
+      "step": 928
+    },
+    {
+      "epoch": 0.24773333333333333,
+      "grad_norm": 0.40196095507069857,
+      "learning_rate": 0.00017617087205965985,
+      "loss": 0.728,
+      "step": 929
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.36036525174309214,
+      "learning_rate": 0.00017611487720799865,
+      "loss": 0.662,
+      "step": 930
+    },
+    {
+      "epoch": 0.24826666666666666,
+      "grad_norm": 0.3450622496118081,
+      "learning_rate": 0.00017605882556492536,
+      "loss": 0.6656,
+      "step": 931
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.3827344319113975,
+      "learning_rate": 0.00017600271717226165,
+      "loss": 0.6747,
+      "step": 932
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.35548505635722083,
+      "learning_rate": 0.00017594655207187157,
+      "loss": 0.6792,
+      "step": 933
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.35897227580611457,
+      "learning_rate": 0.0001758903303056614,
+      "loss": 0.6459,
+      "step": 934
+    },
+    {
+      "epoch": 0.24933333333333332,
+      "grad_norm": 0.39868659999654915,
+      "learning_rate": 0.0001758340519155798,
+      "loss": 0.6946,
+      "step": 935
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3733448716299448,
+      "learning_rate": 0.0001757777169436176,
+      "loss": 0.708,
+      "step": 936
+    },
+    {
+      "epoch": 0.24986666666666665,
+      "grad_norm": 0.35651216220935267,
+      "learning_rate": 0.00017572132543180788,
+      "loss": 0.6846,
+      "step": 937
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.382658548425777,
+      "learning_rate": 0.00017566487742222596,
+      "loss": 0.6739,
+      "step": 938
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.36169049365474176,
+      "learning_rate": 0.00017560837295698916,
+      "loss": 0.6289,
+      "step": 939
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.3713348766937921,
+      "learning_rate": 0.0001755518120782571,
+      "loss": 0.6972,
+      "step": 940
+    },
+    {
+      "epoch": 0.25093333333333334,
+      "grad_norm": 0.37561670810196857,
+      "learning_rate": 0.0001754951948282314,
+      "loss": 0.6828,
+      "step": 941
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.38705806321579894,
+      "learning_rate": 0.0001754385212491557,
+      "loss": 0.7119,
+      "step": 942
+    },
+    {
+      "epoch": 0.25146666666666667,
+      "grad_norm": 0.39562921430988984,
+      "learning_rate": 0.00017538179138331582,
+      "loss": 0.6651,
+      "step": 943
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.41192107043951953,
+      "learning_rate": 0.00017532500527303938,
+      "loss": 0.6803,
+      "step": 944
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.3858319014901234,
+      "learning_rate": 0.00017526816296069614,
+      "loss": 0.7049,
+      "step": 945
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.37856325845192396,
+      "learning_rate": 0.00017521126448869772,
+      "loss": 0.6454,
+      "step": 946
+    },
+    {
+      "epoch": 0.25253333333333333,
+      "grad_norm": 0.42287906762165467,
+      "learning_rate": 0.00017515430989949754,
+      "loss": 0.6724,
+      "step": 947
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.38701881331527244,
+      "learning_rate": 0.00017509729923559112,
+      "loss": 0.661,
+      "step": 948
+    },
+    {
+      "epoch": 0.25306666666666666,
+      "grad_norm": 0.382028694687477,
+      "learning_rate": 0.00017504023253951562,
+      "loss": 0.6603,
+      "step": 949
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.35823239469334595,
+      "learning_rate": 0.00017498310985385008,
+      "loss": 0.6681,
+      "step": 950
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.3657986186198803,
+      "learning_rate": 0.00017492593122121536,
+      "loss": 0.6195,
+      "step": 951
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.36008755659015096,
+      "learning_rate": 0.00017486869668427394,
+      "loss": 0.6287,
+      "step": 952
+    },
+    {
+      "epoch": 0.2541333333333333,
+      "grad_norm": 0.36541505898332444,
+      "learning_rate": 0.00017481140628573016,
+      "loss": 0.6741,
+      "step": 953
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.37576726204518385,
+      "learning_rate": 0.00017475406006832995,
+      "loss": 0.7042,
+      "step": 954
+    },
+    {
+      "epoch": 0.25466666666666665,
+      "grad_norm": 0.37805162720487145,
+      "learning_rate": 0.0001746966580748609,
+      "loss": 0.6494,
+      "step": 955
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.3522710023320336,
+      "learning_rate": 0.00017463920034815216,
+      "loss": 0.6649,
+      "step": 956
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.3560015561765506,
+      "learning_rate": 0.00017458168693107465,
+      "loss": 0.6844,
+      "step": 957
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4218857153029114,
+      "learning_rate": 0.00017452411786654062,
+      "loss": 0.7998,
+      "step": 958
+    },
+    {
+      "epoch": 0.2557333333333333,
+      "grad_norm": 0.36335816278009564,
+      "learning_rate": 0.00017446649319750402,
+      "loss": 0.6404,
+      "step": 959
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.34846999568866915,
+      "learning_rate": 0.0001744088129669601,
+      "loss": 0.662,
+      "step": 960
+    },
+    {
+      "epoch": 0.25626666666666664,
+      "grad_norm": 0.349520338294158,
+      "learning_rate": 0.00017435107721794577,
+      "loss": 0.6463,
+      "step": 961
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.38063931231201026,
+      "learning_rate": 0.00017429328599353924,
+      "loss": 0.6837,
+      "step": 962
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.3737308226896675,
+      "learning_rate": 0.00017423543933686012,
+      "loss": 0.7435,
+      "step": 963
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.3650735125260079,
+      "learning_rate": 0.0001741775372910694,
+      "loss": 0.6597,
+      "step": 964
+    },
+    {
+      "epoch": 0.25733333333333336,
+      "grad_norm": 0.3398711396353875,
+      "learning_rate": 0.00017411957989936941,
+      "loss": 0.6613,
+      "step": 965
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.37537336653979186,
+      "learning_rate": 0.00017406156720500376,
+      "loss": 0.6664,
+      "step": 966
+    },
+    {
+      "epoch": 0.2578666666666667,
+      "grad_norm": 0.37882826754476945,
+      "learning_rate": 0.00017400349925125733,
+      "loss": 0.6795,
+      "step": 967
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.37987112452770816,
+      "learning_rate": 0.0001739453760814562,
+      "loss": 0.6681,
+      "step": 968
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.3956517038538305,
+      "learning_rate": 0.00017388719773896768,
+      "loss": 0.6895,
+      "step": 969
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.38632946836606147,
+      "learning_rate": 0.00017382896426720024,
+      "loss": 0.6631,
+      "step": 970
+    },
+    {
+      "epoch": 0.25893333333333335,
+      "grad_norm": 0.36795667973267104,
+      "learning_rate": 0.00017377067570960352,
+      "loss": 0.6551,
+      "step": 971
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3827397268368528,
+      "learning_rate": 0.00017371233210966816,
+      "loss": 0.6936,
+      "step": 972
+    },
+    {
+      "epoch": 0.2594666666666667,
+      "grad_norm": 0.47607575702912713,
+      "learning_rate": 0.00017365393351092596,
+      "loss": 0.6075,
+      "step": 973
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.36037312736055466,
+      "learning_rate": 0.00017359547995694975,
+      "loss": 0.6494,
+      "step": 974
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.35789956483448954,
+      "learning_rate": 0.00017353697149135325,
+      "loss": 0.6267,
+      "step": 975
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.36342931327877476,
+      "learning_rate": 0.00017347840815779136,
+      "loss": 0.7116,
+      "step": 976
+    },
+    {
+      "epoch": 0.26053333333333334,
+      "grad_norm": 0.36287677779910565,
+      "learning_rate": 0.00017341978999995975,
+      "loss": 0.6531,
+      "step": 977
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.34975457415301003,
+      "learning_rate": 0.00017336111706159506,
+      "loss": 0.6385,
+      "step": 978
+    },
+    {
+      "epoch": 0.26106666666666667,
+      "grad_norm": 0.36562474091398467,
+      "learning_rate": 0.00017330238938647474,
+      "loss": 0.6516,
+      "step": 979
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.3539378322118544,
+      "learning_rate": 0.00017324360701841717,
+      "loss": 0.6435,
+      "step": 980
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.34563315328494715,
+      "learning_rate": 0.00017318477000128151,
+      "loss": 0.6748,
+      "step": 981
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.3841838483389478,
+      "learning_rate": 0.00017312587837896767,
+      "loss": 0.6784,
+      "step": 982
+    },
+    {
+      "epoch": 0.26213333333333333,
+      "grad_norm": 0.3615563485319042,
+      "learning_rate": 0.00017306693219541633,
+      "loss": 0.6782,
+      "step": 983
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5406080241399385,
+      "learning_rate": 0.00017300793149460883,
+      "loss": 0.6646,
+      "step": 984
+    },
+    {
+      "epoch": 0.26266666666666666,
+      "grad_norm": 0.34457375526611966,
+      "learning_rate": 0.00017294887632056724,
+      "loss": 0.6545,
+      "step": 985
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.35865336747608817,
+      "learning_rate": 0.00017288976671735426,
+      "loss": 0.6534,
+      "step": 986
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.37085132674766746,
+      "learning_rate": 0.0001728306027290732,
+      "loss": 0.632,
+      "step": 987
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.36442084375889044,
+      "learning_rate": 0.00017277138439986795,
+      "loss": 0.6495,
+      "step": 988
+    },
+    {
+      "epoch": 0.2637333333333333,
+      "grad_norm": 0.384949504867074,
+      "learning_rate": 0.00017271211177392296,
+      "loss": 0.6647,
+      "step": 989
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.37650754017425686,
+      "learning_rate": 0.00017265278489546308,
+      "loss": 0.6876,
+      "step": 990
+    },
+    {
+      "epoch": 0.26426666666666665,
+      "grad_norm": 0.37880256580782695,
+      "learning_rate": 0.00017259340380875384,
+      "loss": 0.6402,
+      "step": 991
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.37368116886725655,
+      "learning_rate": 0.00017253396855810107,
+      "loss": 0.6552,
+      "step": 992
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.39662226242597787,
+      "learning_rate": 0.00017247447918785104,
+      "loss": 0.6766,
+      "step": 993
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.38645991795238954,
+      "learning_rate": 0.0001724149357423904,
+      "loss": 0.6795,
+      "step": 994
+    },
+    {
+      "epoch": 0.2653333333333333,
+      "grad_norm": 0.3769234981738238,
+      "learning_rate": 0.0001723553382661462,
+      "loss": 0.6751,
+      "step": 995
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.35753086326356054,
+      "learning_rate": 0.00017229568680358575,
+      "loss": 0.6677,
+      "step": 996
+    },
+    {
+      "epoch": 0.26586666666666664,
+      "grad_norm": 0.3872739276548074,
+      "learning_rate": 0.00017223598139921666,
+      "loss": 0.7158,
+      "step": 997
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.3460464919380646,
+      "learning_rate": 0.00017217622209758675,
+      "loss": 0.6815,
+      "step": 998
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.3732913527461127,
+      "learning_rate": 0.00017211640894328412,
+      "loss": 0.7025,
+      "step": 999
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.3753496382940533,
+      "learning_rate": 0.00017205654198093696,
+      "loss": 0.702,
+      "step": 1000
+    },
+    {
+      "epoch": 0.26693333333333336,
+      "grad_norm": 0.35997945440192164,
+      "learning_rate": 0.00017199662125521377,
+      "loss": 0.6728,
+      "step": 1001
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.3574497641196626,
+      "learning_rate": 0.00017193664681082295,
+      "loss": 0.6552,
+      "step": 1002
+    },
+    {
+      "epoch": 0.2674666666666667,
+      "grad_norm": 0.36065811987352553,
+      "learning_rate": 0.00017187661869251313,
+      "loss": 0.6631,
+      "step": 1003
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.3691275127306625,
+      "learning_rate": 0.00017181653694507297,
+      "loss": 0.6415,
+      "step": 1004
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.3862848020475319,
+      "learning_rate": 0.0001717564016133311,
+      "loss": 0.6973,
+      "step": 1005
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.36563631397392443,
+      "learning_rate": 0.00017169621274215613,
+      "loss": 0.6213,
+      "step": 1006
+    },
+    {
+      "epoch": 0.26853333333333335,
+      "grad_norm": 0.37070552499562903,
+      "learning_rate": 0.00017163597037645666,
+      "loss": 0.6463,
+      "step": 1007
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.38403109410280967,
+      "learning_rate": 0.00017157567456118123,
+      "loss": 0.7268,
+      "step": 1008
+    },
+    {
+      "epoch": 0.2690666666666667,
+      "grad_norm": 0.3930852881835335,
+      "learning_rate": 0.0001715153253413181,
+      "loss": 0.7423,
+      "step": 1009
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.36070837973069164,
+      "learning_rate": 0.00017145492276189562,
+      "loss": 0.6704,
+      "step": 1010
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.3712880317217717,
+      "learning_rate": 0.00017139446686798175,
+      "loss": 0.6619,
+      "step": 1011
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.38815807584402684,
+      "learning_rate": 0.0001713339577046843,
+      "loss": 0.6804,
+      "step": 1012
+    },
+    {
+      "epoch": 0.27013333333333334,
+      "grad_norm": 0.37497177446629665,
+      "learning_rate": 0.00017127339531715084,
+      "loss": 0.6666,
+      "step": 1013
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.36114355099659295,
+      "learning_rate": 0.00017121277975056865,
+      "loss": 0.6141,
+      "step": 1014
+    },
+    {
+      "epoch": 0.27066666666666667,
+      "grad_norm": 0.3683652498766256,
+      "learning_rate": 0.00017115211105016463,
+      "loss": 0.673,
+      "step": 1015
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.39361544308351054,
+      "learning_rate": 0.00017109138926120547,
+      "loss": 0.6774,
+      "step": 1016
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.3701280177297978,
+      "learning_rate": 0.00017103061442899729,
+      "loss": 0.7574,
+      "step": 1017
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.36235762561339424,
+      "learning_rate": 0.00017096978659888586,
+      "loss": 0.6059,
+      "step": 1018
+    },
+    {
+      "epoch": 0.2717333333333333,
+      "grad_norm": 0.40450227155709073,
+      "learning_rate": 0.0001709089058162566,
+      "loss": 0.6639,
+      "step": 1019
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3709054607836301,
+      "learning_rate": 0.00017084797212653429,
+      "loss": 0.6022,
+      "step": 1020
+    },
+    {
+      "epoch": 0.27226666666666666,
+      "grad_norm": 0.3616505454176225,
+      "learning_rate": 0.00017078698557518318,
+      "loss": 0.6562,
+      "step": 1021
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.3639892418035812,
+      "learning_rate": 0.0001707259462077071,
+      "loss": 0.6463,
+      "step": 1022
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.36599974791078516,
+      "learning_rate": 0.00017066485406964915,
+      "loss": 0.6535,
+      "step": 1023
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.4085410536846461,
+      "learning_rate": 0.0001706037092065919,
+      "loss": 0.7248,
+      "step": 1024
+    },
+    {
+      "epoch": 0.2733333333333333,
+      "grad_norm": 0.37112209185505324,
+      "learning_rate": 0.00017054251166415726,
+      "loss": 0.6836,
+      "step": 1025
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3714015553228088,
+      "learning_rate": 0.00017048126148800635,
+      "loss": 0.6857,
+      "step": 1026
+    },
+    {
+      "epoch": 0.27386666666666665,
+      "grad_norm": 0.36393290295019776,
+      "learning_rate": 0.0001704199587238396,
+      "loss": 0.7034,
+      "step": 1027
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.37032237548237024,
+      "learning_rate": 0.00017035860341739674,
+      "loss": 0.669,
+      "step": 1028
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.3847663769376323,
+      "learning_rate": 0.00017029719561445665,
+      "loss": 0.7103,
+      "step": 1029
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.36727470895649217,
+      "learning_rate": 0.00017023573536083735,
+      "loss": 0.6926,
+      "step": 1030
+    },
+    {
+      "epoch": 0.2749333333333333,
+      "grad_norm": 0.4119377769662957,
+      "learning_rate": 0.0001701742227023961,
+      "loss": 0.6666,
+      "step": 1031
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3850447201755565,
+      "learning_rate": 0.00017011265768502912,
+      "loss": 0.6668,
+      "step": 1032
+    },
+    {
+      "epoch": 0.2754666666666667,
+      "grad_norm": 0.36228529837308865,
+      "learning_rate": 0.00017005104035467184,
+      "loss": 0.6372,
+      "step": 1033
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.3665351766490015,
+      "learning_rate": 0.0001699893707572986,
+      "loss": 0.6906,
+      "step": 1034
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.37174418114514696,
+      "learning_rate": 0.0001699276489389228,
+      "loss": 0.7004,
+      "step": 1035
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.3934583187983519,
+      "learning_rate": 0.00016986587494559682,
+      "loss": 0.6933,
+      "step": 1036
+    },
+    {
+      "epoch": 0.27653333333333335,
+      "grad_norm": 0.3703166739240665,
+      "learning_rate": 0.0001698040488234119,
+      "loss": 0.705,
+      "step": 1037
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.3744401150164368,
+      "learning_rate": 0.0001697421706184983,
+      "loss": 0.6827,
+      "step": 1038
+    },
+    {
+      "epoch": 0.2770666666666667,
+      "grad_norm": 0.3658276342156474,
+      "learning_rate": 0.00016968024037702491,
+      "loss": 0.7223,
+      "step": 1039
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.35985253347044355,
+      "learning_rate": 0.00016961825814519976,
+      "loss": 0.6944,
+      "step": 1040
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.36943852949361683,
+      "learning_rate": 0.0001695562239692694,
+      "loss": 0.6679,
+      "step": 1041
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.36021617629421065,
+      "learning_rate": 0.00016949413789551924,
+      "loss": 0.6426,
+      "step": 1042
+    },
+    {
+      "epoch": 0.27813333333333334,
+      "grad_norm": 0.3927581675847467,
+      "learning_rate": 0.0001694319999702735,
+      "loss": 0.6149,
+      "step": 1043
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.38415736834236625,
+      "learning_rate": 0.0001693698102398949,
+      "loss": 0.6353,
+      "step": 1044
+    },
+    {
+      "epoch": 0.2786666666666667,
+      "grad_norm": 0.40727383142638085,
+      "learning_rate": 0.00016930756875078496,
+      "loss": 0.6988,
+      "step": 1045
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.36211847481910026,
+      "learning_rate": 0.00016924527554938382,
+      "loss": 0.641,
+      "step": 1046
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.3924335682331854,
+      "learning_rate": 0.0001691829306821701,
+      "loss": 0.6694,
+      "step": 1047
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.3557572253638986,
+      "learning_rate": 0.000169120534195661,
+      "loss": 0.6477,
+      "step": 1048
+    },
+    {
+      "epoch": 0.27973333333333333,
+      "grad_norm": 0.3576945514579245,
+      "learning_rate": 0.00016905808613641235,
+      "loss": 0.6571,
+      "step": 1049
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3432926613371049,
+      "learning_rate": 0.0001689955865510183,
+      "loss": 0.6387,
+      "step": 1050
+    },
+    {
+      "epoch": 0.28026666666666666,
+      "grad_norm": 0.36141449663437264,
+      "learning_rate": 0.00016893303548611152,
+      "loss": 0.6742,
+      "step": 1051
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.35812745513977534,
+      "learning_rate": 0.00016887043298836316,
+      "loss": 0.6633,
+      "step": 1052
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.3581035720758667,
+      "learning_rate": 0.0001688077791044826,
+      "loss": 0.6878,
+      "step": 1053
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.37736573683962743,
+      "learning_rate": 0.00016874507388121764,
+      "loss": 0.7029,
+      "step": 1054
+    },
+    {
+      "epoch": 0.2813333333333333,
+      "grad_norm": 0.36786599115150365,
+      "learning_rate": 0.0001686823173653544,
+      "loss": 0.699,
+      "step": 1055
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3504468067105522,
+      "learning_rate": 0.00016861950960371725,
+      "loss": 0.6571,
+      "step": 1056
+    },
+    {
+      "epoch": 0.28186666666666665,
+      "grad_norm": 0.37484424032181257,
+      "learning_rate": 0.00016855665064316878,
+      "loss": 0.6797,
+      "step": 1057
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.3619203324005068,
+      "learning_rate": 0.00016849374053060982,
+      "loss": 0.6864,
+      "step": 1058
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.3919017460182505,
+      "learning_rate": 0.00016843077931297931,
+      "loss": 0.6688,
+      "step": 1059
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.36523376713834776,
+      "learning_rate": 0.0001683677670372544,
+      "loss": 0.6491,
+      "step": 1060
+    },
+    {
+      "epoch": 0.2829333333333333,
+      "grad_norm": 0.4042315615415427,
+      "learning_rate": 0.00016830470375045026,
+      "loss": 0.7029,
+      "step": 1061
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.38412192749748014,
+      "learning_rate": 0.0001682415894996201,
+      "loss": 0.7355,
+      "step": 1062
+    },
+    {
+      "epoch": 0.28346666666666664,
+      "grad_norm": 0.36013514606028135,
+      "learning_rate": 0.0001681784243318553,
+      "loss": 0.7206,
+      "step": 1063
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.3732908816050375,
+      "learning_rate": 0.0001681152082942851,
+      "loss": 0.6888,
+      "step": 1064
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.36299458065614304,
+      "learning_rate": 0.0001680519414340767,
+      "loss": 0.6489,
+      "step": 1065
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.3572414467072343,
+      "learning_rate": 0.0001679886237984353,
+      "loss": 0.6226,
+      "step": 1066
+    },
+    {
+      "epoch": 0.28453333333333336,
+      "grad_norm": 0.36499664268910215,
+      "learning_rate": 0.00016792525543460386,
+      "loss": 0.6657,
+      "step": 1067
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4036141052244234,
+      "learning_rate": 0.00016786183638986337,
+      "loss": 0.6645,
+      "step": 1068
+    },
+    {
+      "epoch": 0.2850666666666667,
+      "grad_norm": 0.37185305842029137,
+      "learning_rate": 0.00016779836671153246,
+      "loss": 0.6739,
+      "step": 1069
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.35674199303735904,
+      "learning_rate": 0.00016773484644696764,
+      "loss": 0.7012,
+      "step": 1070
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.3459982007445063,
+      "learning_rate": 0.0001676712756435631,
+      "loss": 0.6454,
+      "step": 1071
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.36779706444281307,
+      "learning_rate": 0.0001676076543487508,
+      "loss": 0.692,
+      "step": 1072
+    },
+    {
+      "epoch": 0.28613333333333335,
+      "grad_norm": 0.35273160102593354,
+      "learning_rate": 0.00016754398261000037,
+      "loss": 0.6622,
+      "step": 1073
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.3453388756501663,
+      "learning_rate": 0.000167480260474819,
+      "loss": 0.6803,
+      "step": 1074
+    },
+    {
+      "epoch": 0.2866666666666667,
+      "grad_norm": 0.3609617910598972,
+      "learning_rate": 0.00016741648799075158,
+      "loss": 0.6633,
+      "step": 1075
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.339521677038658,
+      "learning_rate": 0.00016735266520538048,
+      "loss": 0.635,
+      "step": 1076
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.36655723439723775,
+      "learning_rate": 0.00016728879216632567,
+      "loss": 0.6184,
+      "step": 1077
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.36111905308044123,
+      "learning_rate": 0.00016722486892124455,
+      "loss": 0.6739,
+      "step": 1078
+    },
+    {
+      "epoch": 0.28773333333333334,
+      "grad_norm": 0.397710445374459,
+      "learning_rate": 0.0001671608955178321,
+      "loss": 0.6854,
+      "step": 1079
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3707630546517936,
+      "learning_rate": 0.0001670968720038206,
+      "loss": 0.6103,
+      "step": 1080
+    },
+    {
+      "epoch": 0.28826666666666667,
+      "grad_norm": 0.3710674802693859,
+      "learning_rate": 0.00016703279842697973,
+      "loss": 0.6735,
+      "step": 1081
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.3643854446717735,
+      "learning_rate": 0.00016696867483511656,
+      "loss": 0.5974,
+      "step": 1082
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.35067221992190195,
+      "learning_rate": 0.00016690450127607553,
+      "loss": 0.6635,
+      "step": 1083
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.3621828416846456,
+      "learning_rate": 0.00016684027779773826,
+      "loss": 0.6324,
+      "step": 1084
+    },
+    {
+      "epoch": 0.28933333333333333,
+      "grad_norm": 0.37352912194864135,
+      "learning_rate": 0.00016677600444802365,
+      "loss": 0.6739,
+      "step": 1085
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.37039670950564546,
+      "learning_rate": 0.00016671168127488785,
+      "loss": 0.6924,
+      "step": 1086
+    },
+    {
+      "epoch": 0.28986666666666666,
+      "grad_norm": 0.34511970312909246,
+      "learning_rate": 0.00016664730832632415,
+      "loss": 0.5947,
+      "step": 1087
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.35705567958317896,
+      "learning_rate": 0.000166582885650363,
+      "loss": 0.6652,
+      "step": 1088
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.3557212822445767,
+      "learning_rate": 0.0001665184132950719,
+      "loss": 0.6948,
+      "step": 1089
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.3273863887034328,
+      "learning_rate": 0.00016645389130855547,
+      "loss": 0.6696,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2909333333333333,
+      "grad_norm": 0.35539875700521045,
+      "learning_rate": 0.00016638931973895536,
+      "loss": 0.6458,
+      "step": 1091
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.36460568537688526,
+      "learning_rate": 0.0001663246986344502,
+      "loss": 0.657,
+      "step": 1092
+    },
+    {
+      "epoch": 0.29146666666666665,
+      "grad_norm": 0.36649558584500275,
+      "learning_rate": 0.00016626002804325557,
+      "loss": 0.6609,
+      "step": 1093
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.39106219168027234,
+      "learning_rate": 0.00016619530801362394,
+      "loss": 0.6779,
+      "step": 1094
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.3491447468740579,
+      "learning_rate": 0.00016613053859384485,
+      "loss": 0.6291,
+      "step": 1095
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.3671723824209041,
+      "learning_rate": 0.0001660657198322444,
+      "loss": 0.6181,
+      "step": 1096
+    },
+    {
+      "epoch": 0.2925333333333333,
+      "grad_norm": 0.3810939764036908,
+      "learning_rate": 0.00016600085177718573,
+      "loss": 0.6865,
+      "step": 1097
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.3275163187792605,
+      "learning_rate": 0.00016593593447706865,
+      "loss": 0.617,
+      "step": 1098
+    },
+    {
+      "epoch": 0.29306666666666664,
+      "grad_norm": 0.3519344941542276,
+      "learning_rate": 0.00016587096798032983,
+      "loss": 0.6611,
+      "step": 1099
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.36470694237855455,
+      "learning_rate": 0.00016580595233544248,
+      "loss": 0.7194,
+      "step": 1100
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.3651018255813527,
+      "learning_rate": 0.00016574088759091664,
+      "loss": 0.6889,
+      "step": 1101
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.39625683516580695,
+      "learning_rate": 0.00016567577379529883,
+      "loss": 0.7292,
+      "step": 1102
+    },
+    {
+      "epoch": 0.29413333333333336,
+      "grad_norm": 0.37248125147916067,
+      "learning_rate": 0.00016561061099717235,
+      "loss": 0.6468,
+      "step": 1103
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.34616853968603206,
+      "learning_rate": 0.00016554539924515686,
+      "loss": 0.6406,
+      "step": 1104
+    },
+    {
+      "epoch": 0.2946666666666667,
+      "grad_norm": 0.36652518870161205,
+      "learning_rate": 0.00016548013858790875,
+      "loss": 0.6616,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.39987104771015125,
+      "learning_rate": 0.0001654148290741207,
+      "loss": 0.7032,
+      "step": 1106
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.3288161117550961,
+      "learning_rate": 0.00016534947075252203,
+      "loss": 0.607,
+      "step": 1107
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.37883166858825185,
+      "learning_rate": 0.00016528406367187834,
+      "loss": 0.6274,
+      "step": 1108
+    },
+    {
+      "epoch": 0.29573333333333335,
+      "grad_norm": 0.38880499843762534,
+      "learning_rate": 0.00016521860788099165,
+      "loss": 0.6164,
+      "step": 1109
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.37510296895333245,
+      "learning_rate": 0.0001651531034287004,
+      "loss": 0.67,
+      "step": 1110
+    },
+    {
+      "epoch": 0.2962666666666667,
+      "grad_norm": 0.3728715451387841,
+      "learning_rate": 0.00016508755036387919,
+      "loss": 0.6589,
+      "step": 1111
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.36323250640918014,
+      "learning_rate": 0.000165021948735439,
+      "loss": 0.61,
+      "step": 1112
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.3870463288023288,
+      "learning_rate": 0.00016495629859232704,
+      "loss": 0.6544,
+      "step": 1113
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.3605244652694021,
+      "learning_rate": 0.00016489059998352668,
+      "loss": 0.6255,
+      "step": 1114
+    },
+    {
+      "epoch": 0.29733333333333334,
+      "grad_norm": 0.38127211315090653,
+      "learning_rate": 0.00016482485295805748,
+      "loss": 0.6354,
+      "step": 1115
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.37297391747401476,
+      "learning_rate": 0.00016475905756497506,
+      "loss": 0.6344,
+      "step": 1116
+    },
+    {
+      "epoch": 0.29786666666666667,
+      "grad_norm": 0.36574361802191735,
+      "learning_rate": 0.00016469321385337123,
+      "loss": 0.6768,
+      "step": 1117
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.349897972234985,
+      "learning_rate": 0.0001646273218723738,
+      "loss": 0.6278,
+      "step": 1118
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.35673503680421903,
+      "learning_rate": 0.00016456138167114656,
+      "loss": 0.663,
+      "step": 1119
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.3646284802005334,
+      "learning_rate": 0.00016449539329888935,
+      "loss": 0.688,
+      "step": 1120
+    },
+    {
+      "epoch": 0.29893333333333333,
+      "grad_norm": 0.37942823954405064,
+      "learning_rate": 0.0001644293568048379,
+      "loss": 0.648,
+      "step": 1121
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.3706131772317103,
+      "learning_rate": 0.00016436327223826389,
+      "loss": 0.6872,
+      "step": 1122
+    },
+    {
+      "epoch": 0.29946666666666666,
+      "grad_norm": 0.363918550349555,
+      "learning_rate": 0.00016429713964847482,
+      "loss": 0.6653,
+      "step": 1123
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.37357927487684806,
+      "learning_rate": 0.00016423095908481403,
+      "loss": 0.6309,
+      "step": 1124
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4160864510634148,
+      "learning_rate": 0.00016416473059666065,
+      "loss": 0.7171,
+      "step": 1125
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.3524992252789292,
+      "learning_rate": 0.00016409845423342968,
+      "loss": 0.5971,
+      "step": 1126
+    },
+    {
+      "epoch": 0.3005333333333333,
+      "grad_norm": 0.36655727780622716,
+      "learning_rate": 0.00016403213004457162,
+      "loss": 0.6747,
+      "step": 1127
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.36583303447162424,
+      "learning_rate": 0.00016396575807957285,
+      "loss": 0.6894,
+      "step": 1128
+    },
+    {
+      "epoch": 0.30106666666666665,
+      "grad_norm": 0.4067169112555897,
+      "learning_rate": 0.00016389933838795534,
+      "loss": 0.7781,
+      "step": 1129
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.3664677596940745,
+      "learning_rate": 0.0001638328710192766,
+      "loss": 0.6704,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.37859485625377665,
+      "learning_rate": 0.00016376635602312982,
+      "loss": 0.6546,
+      "step": 1131
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.35192579221280346,
+      "learning_rate": 0.00016369979344914363,
+      "loss": 0.6297,
+      "step": 1132
+    },
+    {
+      "epoch": 0.3021333333333333,
+      "grad_norm": 0.34384956160748753,
+      "learning_rate": 0.00016363318334698223,
+      "loss": 0.6177,
+      "step": 1133
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4504626611019618,
+      "learning_rate": 0.00016356652576634528,
+      "loss": 0.6942,
+      "step": 1134
+    },
+    {
+      "epoch": 0.30266666666666664,
+      "grad_norm": 0.39280597953969865,
+      "learning_rate": 0.0001634998207569678,
+      "loss": 0.6659,
+      "step": 1135
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.3679328159231638,
+      "learning_rate": 0.00016343306836862027,
+      "loss": 0.7045,
+      "step": 1136
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.35151668026257765,
+      "learning_rate": 0.00016336626865110843,
+      "loss": 0.6501,
+      "step": 1137
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.35385315501283787,
+      "learning_rate": 0.0001632994216542735,
+      "loss": 0.6332,
+      "step": 1138
+    },
+    {
+      "epoch": 0.30373333333333336,
+      "grad_norm": 0.36363587618583665,
+      "learning_rate": 0.00016323252742799182,
+      "loss": 0.6315,
+      "step": 1139
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3948592600839174,
+      "learning_rate": 0.000163165586022175,
+      "loss": 0.6482,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3042666666666667,
+      "grad_norm": 0.36302246509769004,
+      "learning_rate": 0.00016309859748676983,
+      "loss": 0.6671,
+      "step": 1141
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.3779194376140945,
+      "learning_rate": 0.00016303156187175843,
+      "loss": 0.6871,
+      "step": 1142
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.36192175473327054,
+      "learning_rate": 0.0001629644792271578,
+      "loss": 0.6475,
+      "step": 1143
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.37289607890822235,
+      "learning_rate": 0.00016289734960302026,
+      "loss": 0.6949,
+      "step": 1144
+    },
+    {
+      "epoch": 0.30533333333333335,
+      "grad_norm": 0.3523417041461089,
+      "learning_rate": 0.00016283017304943295,
+      "loss": 0.628,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.3551378752338613,
+      "learning_rate": 0.0001627629496165183,
+      "loss": 0.6634,
+      "step": 1146
+    },
+    {
+      "epoch": 0.3058666666666667,
+      "grad_norm": 0.3472197515319477,
+      "learning_rate": 0.0001626956793544335,
+      "loss": 0.6865,
+      "step": 1147
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.35780980480311225,
+      "learning_rate": 0.00016262836231337071,
+      "loss": 0.6342,
+      "step": 1148
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.338062986445356,
+      "learning_rate": 0.00016256099854355707,
+      "loss": 0.6323,
+      "step": 1149
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.3659727126389936,
+      "learning_rate": 0.00016249358809525456,
+      "loss": 0.6675,
+      "step": 1150
+    },
+    {
+      "epoch": 0.30693333333333334,
+      "grad_norm": 0.3795725157114804,
+      "learning_rate": 0.00016242613101876,
+      "loss": 0.6663,
+      "step": 1151
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.35794199426970963,
+      "learning_rate": 0.00016235862736440487,
+      "loss": 0.6523,
+      "step": 1152
+    },
+    {
+      "epoch": 0.30746666666666667,
+      "grad_norm": 0.3642950023320711,
+      "learning_rate": 0.00016229107718255566,
+      "loss": 0.6538,
+      "step": 1153
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.36534479278477566,
+      "learning_rate": 0.00016222348052361333,
+      "loss": 0.6342,
+      "step": 1154
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.3740616755218169,
+      "learning_rate": 0.0001621558374380136,
+      "loss": 0.6854,
+      "step": 1155
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.38679655278656966,
+      "learning_rate": 0.00016208814797622693,
+      "loss": 0.6466,
+      "step": 1156
+    },
+    {
+      "epoch": 0.3085333333333333,
+      "grad_norm": 0.37379508723873567,
+      "learning_rate": 0.00016202041218875825,
+      "loss": 0.6745,
+      "step": 1157
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.3506501397117617,
+      "learning_rate": 0.00016195263012614705,
+      "loss": 0.6428,
+      "step": 1158
+    },
+    {
+      "epoch": 0.30906666666666666,
+      "grad_norm": 0.3842823121392134,
+      "learning_rate": 0.0001618848018389675,
+      "loss": 0.6984,
+      "step": 1159
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.3680409119623642,
+      "learning_rate": 0.00016181692737782808,
+      "loss": 0.6595,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.38458618762078806,
+      "learning_rate": 0.00016174900679337184,
+      "loss": 0.6032,
+      "step": 1161
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.3822741148204659,
+      "learning_rate": 0.0001616810401362762,
+      "loss": 0.6991,
+      "step": 1162
+    },
+    {
+      "epoch": 0.3101333333333333,
+      "grad_norm": 0.3335556786759688,
+      "learning_rate": 0.00016161302745725293,
+      "loss": 0.6173,
+      "step": 1163
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.33459018128300577,
+      "learning_rate": 0.00016154496880704819,
+      "loss": 0.6514,
+      "step": 1164
+    },
+    {
+      "epoch": 0.31066666666666665,
+      "grad_norm": 0.35310490386718985,
+      "learning_rate": 0.00016147686423644242,
+      "loss": 0.6627,
+      "step": 1165
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.38067503455267376,
+      "learning_rate": 0.00016140871379625033,
+      "loss": 0.6253,
+      "step": 1166
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.3959262621372224,
+      "learning_rate": 0.00016134051753732083,
+      "loss": 0.6652,
+      "step": 1167
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.34540358267945126,
+      "learning_rate": 0.00016127227551053703,
+      "loss": 0.6386,
+      "step": 1168
+    },
+    {
+      "epoch": 0.3117333333333333,
+      "grad_norm": 0.34445275213733884,
+      "learning_rate": 0.0001612039877668162,
+      "loss": 0.6275,
+      "step": 1169
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3679875177944374,
+      "learning_rate": 0.00016113565435710972,
+      "loss": 0.7062,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3122666666666667,
+      "grad_norm": 0.3822147893078014,
+      "learning_rate": 0.00016106727533240302,
+      "loss": 0.6532,
+      "step": 1171
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.36408722525935105,
+      "learning_rate": 0.00016099885074371558,
+      "loss": 0.6316,
+      "step": 1172
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.35163563376622164,
+      "learning_rate": 0.0001609303806421009,
+      "loss": 0.6348,
+      "step": 1173
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.36364646710195,
+      "learning_rate": 0.00016086186507864635,
+      "loss": 0.6776,
+      "step": 1174
+    },
+    {
+      "epoch": 0.31333333333333335,
+      "grad_norm": 0.3333509329366065,
+      "learning_rate": 0.00016079330410447335,
+      "loss": 0.6054,
+      "step": 1175
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.351290564829832,
+      "learning_rate": 0.0001607246977707371,
+      "loss": 0.6479,
+      "step": 1176
+    },
+    {
+      "epoch": 0.3138666666666667,
+      "grad_norm": 0.3576339066147463,
+      "learning_rate": 0.00016065604612862676,
+      "loss": 0.6779,
+      "step": 1177
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.3511175504088521,
+      "learning_rate": 0.00016058734922936506,
+      "loss": 0.6494,
+      "step": 1178
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.3694486746759354,
+      "learning_rate": 0.00016051860712420877,
+      "loss": 0.6912,
+      "step": 1179
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.36096748149280555,
+      "learning_rate": 0.00016044981986444826,
+      "loss": 0.6732,
+      "step": 1180
+    },
+    {
+      "epoch": 0.31493333333333334,
+      "grad_norm": 0.35219851339863323,
+      "learning_rate": 0.0001603809875014076,
+      "loss": 0.6236,
+      "step": 1181
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3765659705700948,
+      "learning_rate": 0.00016031211008644448,
+      "loss": 0.6985,
+      "step": 1182
+    },
+    {
+      "epoch": 0.3154666666666667,
+      "grad_norm": 0.3695985043120348,
+      "learning_rate": 0.0001602431876709503,
+      "loss": 0.7035,
+      "step": 1183
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.3608009123050179,
+      "learning_rate": 0.00016017422030634992,
+      "loss": 0.6399,
+      "step": 1184
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.3748073320657708,
+      "learning_rate": 0.00016010520804410184,
+      "loss": 0.6675,
+      "step": 1185
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.3714347092962332,
+      "learning_rate": 0.000160036150935698,
+      "loss": 0.6843,
+      "step": 1186
+    },
+    {
+      "epoch": 0.31653333333333333,
+      "grad_norm": 0.3928369814244889,
+      "learning_rate": 0.00015996704903266382,
+      "loss": 0.6792,
+      "step": 1187
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3710624996331732,
+      "learning_rate": 0.0001598979023865581,
+      "loss": 0.6291,
+      "step": 1188
+    },
+    {
+      "epoch": 0.31706666666666666,
+      "grad_norm": 0.34999745773625557,
+      "learning_rate": 0.00015982871104897313,
+      "loss": 0.626,
+      "step": 1189
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.38243522357450194,
+      "learning_rate": 0.00015975947507153443,
+      "loss": 0.6929,
+      "step": 1190
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.3630642104901653,
+      "learning_rate": 0.00015969019450590087,
+      "loss": 0.6444,
+      "step": 1191
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.35883144312578763,
+      "learning_rate": 0.00015962086940376463,
+      "loss": 0.6708,
+      "step": 1192
+    },
+    {
+      "epoch": 0.3181333333333333,
+      "grad_norm": 0.37130043786712597,
+      "learning_rate": 0.00015955149981685107,
+      "loss": 0.6878,
+      "step": 1193
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.34879606254394946,
+      "learning_rate": 0.00015948208579691877,
+      "loss": 0.6589,
+      "step": 1194
+    },
+    {
+      "epoch": 0.31866666666666665,
+      "grad_norm": 0.3592033146179873,
+      "learning_rate": 0.00015941262739575937,
+      "loss": 0.649,
+      "step": 1195
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.34879313258280176,
+      "learning_rate": 0.0001593431246651978,
+      "loss": 0.6651,
+      "step": 1196
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.3554203164586111,
+      "learning_rate": 0.0001592735776570919,
+      "loss": 0.6704,
+      "step": 1197
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.35190883617460833,
+      "learning_rate": 0.00015920398642333265,
+      "loss": 0.6956,
+      "step": 1198
+    },
+    {
+      "epoch": 0.3197333333333333,
+      "grad_norm": 0.370121617005598,
+      "learning_rate": 0.00015913435101584398,
+      "loss": 0.6992,
+      "step": 1199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3360198400648102,
+      "learning_rate": 0.0001590646714865828,
+      "loss": 0.6296,
+      "step": 1200
+    },
+    {
+      "epoch": 0.32026666666666664,
+      "grad_norm": 0.35623368554828544,
+      "learning_rate": 0.00015899494788753892,
+      "loss": 0.6498,
+      "step": 1201
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.37256055318413156,
+      "learning_rate": 0.00015892518027073505,
+      "loss": 0.6367,
+      "step": 1202
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.3627845573248268,
+      "learning_rate": 0.00015885536868822671,
+      "loss": 0.6841,
+      "step": 1203
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.36019386889032073,
+      "learning_rate": 0.00015878551319210228,
+      "loss": 0.6096,
+      "step": 1204
+    },
+    {
+      "epoch": 0.32133333333333336,
+      "grad_norm": 0.3574885966722773,
+      "learning_rate": 0.00015871561383448286,
+      "loss": 0.6442,
+      "step": 1205
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.3563774914852631,
+      "learning_rate": 0.0001586456706675223,
+      "loss": 0.6508,
+      "step": 1206
+    },
+    {
+      "epoch": 0.3218666666666667,
+      "grad_norm": 0.3662607386148354,
+      "learning_rate": 0.00015857568374340713,
+      "loss": 0.6518,
+      "step": 1207
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.38425098823181136,
+      "learning_rate": 0.00015850565311435652,
+      "loss": 0.6681,
+      "step": 1208
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.3606066742289815,
+      "learning_rate": 0.00015843557883262225,
+      "loss": 0.6464,
+      "step": 1209
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.3485835343050614,
+      "learning_rate": 0.0001583654609504887,
+      "loss": 0.6415,
+      "step": 1210
+    },
+    {
+      "epoch": 0.32293333333333335,
+      "grad_norm": 0.3433252913598194,
+      "learning_rate": 0.00015829529952027276,
+      "loss": 0.6472,
+      "step": 1211
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.38180169579767054,
+      "learning_rate": 0.00015822509459432379,
+      "loss": 0.6407,
+      "step": 1212
+    },
+    {
+      "epoch": 0.3234666666666667,
+      "grad_norm": 0.3536578967226547,
+      "learning_rate": 0.0001581548462250236,
+      "loss": 0.6363,
+      "step": 1213
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.36302717425259284,
+      "learning_rate": 0.00015808455446478646,
+      "loss": 0.6048,
+      "step": 1214
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.38596616795709465,
+      "learning_rate": 0.00015801421936605903,
+      "loss": 0.6256,
+      "step": 1215
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.39008874153124756,
+      "learning_rate": 0.00015794384098132027,
+      "loss": 0.6775,
+      "step": 1216
+    },
+    {
+      "epoch": 0.32453333333333334,
+      "grad_norm": 0.3599031025738248,
+      "learning_rate": 0.00015787341936308134,
+      "loss": 0.6753,
+      "step": 1217
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.3666623377024128,
+      "learning_rate": 0.00015780295456388588,
+      "loss": 0.6514,
+      "step": 1218
+    },
+    {
+      "epoch": 0.32506666666666667,
+      "grad_norm": 0.3764398383093878,
+      "learning_rate": 0.00015773244663630953,
+      "loss": 0.6318,
+      "step": 1219
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.3616039006983256,
+      "learning_rate": 0.00015766189563296029,
+      "loss": 0.6895,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.36386024728183697,
+      "learning_rate": 0.0001575913016064781,
+      "loss": 0.6189,
+      "step": 1221
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.37325472624912076,
+      "learning_rate": 0.0001575206646095352,
+      "loss": 0.6583,
+      "step": 1222
+    },
+    {
+      "epoch": 0.32613333333333333,
+      "grad_norm": 0.38291898189334206,
+      "learning_rate": 0.00015744998469483575,
+      "loss": 0.6421,
+      "step": 1223
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3653212094574066,
+      "learning_rate": 0.00015737926191511606,
+      "loss": 0.653,
+      "step": 1224
+    },
+    {
+      "epoch": 0.32666666666666666,
+      "grad_norm": 0.37838881194679863,
+      "learning_rate": 0.00015730849632314428,
+      "loss": 0.7042,
+      "step": 1225
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.3851919203073385,
+      "learning_rate": 0.0001572376879717206,
+      "loss": 0.6746,
+      "step": 1226
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.3802774289463264,
+      "learning_rate": 0.00015716683691367704,
+      "loss": 0.64,
+      "step": 1227
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.39424919350285487,
+      "learning_rate": 0.0001570959432018776,
+      "loss": 0.7212,
+      "step": 1228
+    },
+    {
+      "epoch": 0.3277333333333333,
+      "grad_norm": 0.3620130208318971,
+      "learning_rate": 0.00015702500688921805,
+      "loss": 0.6527,
+      "step": 1229
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.37466286399006393,
+      "learning_rate": 0.00015695402802862584,
+      "loss": 0.6102,
+      "step": 1230
+    },
+    {
+      "epoch": 0.32826666666666665,
+      "grad_norm": 0.345108202972318,
+      "learning_rate": 0.00015688300667306032,
+      "loss": 0.6793,
+      "step": 1231
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.3720730074945737,
+      "learning_rate": 0.0001568119428755125,
+      "loss": 0.6619,
+      "step": 1232
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.3569757717493977,
+      "learning_rate": 0.000156740836689005,
+      "loss": 0.6378,
+      "step": 1233
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.3598068042732101,
+      "learning_rate": 0.0001566696881665921,
+      "loss": 0.639,
+      "step": 1234
+    },
+    {
+      "epoch": 0.3293333333333333,
+      "grad_norm": 0.3872322396630404,
+      "learning_rate": 0.00015659849736135976,
+      "loss": 0.6437,
+      "step": 1235
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4061654621552088,
+      "learning_rate": 0.00015652726432642533,
+      "loss": 0.7233,
+      "step": 1236
+    },
+    {
+      "epoch": 0.32986666666666664,
+      "grad_norm": 0.37738913064383833,
+      "learning_rate": 0.00015645598911493775,
+      "loss": 0.6551,
+      "step": 1237
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.3429208761962747,
+      "learning_rate": 0.00015638467178007742,
+      "loss": 0.6359,
+      "step": 1238
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.38534787195578357,
+      "learning_rate": 0.00015631331237505623,
+      "loss": 0.6529,
+      "step": 1239
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.3566870640114799,
+      "learning_rate": 0.00015624191095311735,
+      "loss": 0.6334,
+      "step": 1240
+    },
+    {
+      "epoch": 0.33093333333333336,
+      "grad_norm": 0.3507989735290889,
+      "learning_rate": 0.00015617046756753538,
+      "loss": 0.6648,
+      "step": 1241
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3804327588695725,
+      "learning_rate": 0.00015609898227161617,
+      "loss": 0.5657,
+      "step": 1242
+    },
+    {
+      "epoch": 0.3314666666666667,
+      "grad_norm": 0.36782873077642625,
+      "learning_rate": 0.00015602745511869692,
+      "loss": 0.6844,
+      "step": 1243
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.37440910177301917,
+      "learning_rate": 0.00015595588616214596,
+      "loss": 0.6117,
+      "step": 1244
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.38262043407727425,
+      "learning_rate": 0.0001558842754553629,
+      "loss": 0.6493,
+      "step": 1245
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.34683869042386345,
+      "learning_rate": 0.00015581262305177846,
+      "loss": 0.6775,
+      "step": 1246
+    },
+    {
+      "epoch": 0.33253333333333335,
+      "grad_norm": 0.3519959092207261,
+      "learning_rate": 0.0001557409290048545,
+      "loss": 0.6262,
+      "step": 1247
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.35762407123056417,
+      "learning_rate": 0.00015566919336808388,
+      "loss": 0.587,
+      "step": 1248
+    },
+    {
+      "epoch": 0.3330666666666667,
+      "grad_norm": 0.3734952689135648,
+      "learning_rate": 0.0001555974161949906,
+      "loss": 0.6658,
+      "step": 1249
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.36249522777359267,
+      "learning_rate": 0.00015552559753912953,
+      "loss": 0.6504,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.35338497034278515,
+      "learning_rate": 0.00015545373745408657,
+      "loss": 0.6346,
+      "step": 1251
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.3941213271584644,
+      "learning_rate": 0.00015538183599347853,
+      "loss": 0.6894,
+      "step": 1252
+    },
+    {
+      "epoch": 0.33413333333333334,
+      "grad_norm": 0.37980684186528246,
+      "learning_rate": 0.00015530989321095308,
+      "loss": 0.6465,
+      "step": 1253
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3386191752029083,
+      "learning_rate": 0.0001552379091601887,
+      "loss": 0.6091,
+      "step": 1254
+    },
+    {
+      "epoch": 0.33466666666666667,
+      "grad_norm": 0.35499212515740847,
+      "learning_rate": 0.0001551658838948947,
+      "loss": 0.6708,
+      "step": 1255
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.3507762267790256,
+      "learning_rate": 0.0001550938174688111,
+      "loss": 0.6653,
+      "step": 1256
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3509622875990821,
+      "learning_rate": 0.00015502170993570864,
+      "loss": 0.6546,
+      "step": 1257
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.3683119642145877,
+      "learning_rate": 0.00015494956134938875,
+      "loss": 0.6768,
+      "step": 1258
+    },
+    {
+      "epoch": 0.33573333333333333,
+      "grad_norm": 0.33967456657685374,
+      "learning_rate": 0.00015487737176368352,
+      "loss": 0.6324,
+      "step": 1259
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3490156070968416,
+      "learning_rate": 0.00015480514123245555,
+      "loss": 0.6077,
+      "step": 1260
+    },
+    {
+      "epoch": 0.33626666666666666,
+      "grad_norm": 0.37065388362132234,
+      "learning_rate": 0.00015473286980959805,
+      "loss": 0.6609,
+      "step": 1261
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.36879952858462517,
+      "learning_rate": 0.0001546605575490347,
+      "loss": 0.614,
+      "step": 1262
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.36123447052475727,
+      "learning_rate": 0.00015458820450471974,
+      "loss": 0.6546,
+      "step": 1263
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.35536305457711176,
+      "learning_rate": 0.0001545158107306377,
+      "loss": 0.6206,
+      "step": 1264
+    },
+    {
+      "epoch": 0.3373333333333333,
+      "grad_norm": 0.3953199215685226,
+      "learning_rate": 0.00015444337628080362,
+      "loss": 0.6641,
+      "step": 1265
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.3625484098950176,
+      "learning_rate": 0.00015437090120926284,
+      "loss": 0.6325,
+      "step": 1266
+    },
+    {
+      "epoch": 0.33786666666666665,
+      "grad_norm": 0.3558095021789365,
+      "learning_rate": 0.000154298385570091,
+      "loss": 0.6268,
+      "step": 1267
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.36446549265199385,
+      "learning_rate": 0.00015422582941739397,
+      "loss": 0.6534,
+      "step": 1268
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.35619854430761033,
+      "learning_rate": 0.00015415323280530802,
+      "loss": 0.6651,
+      "step": 1269
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.35900610187354576,
+      "learning_rate": 0.0001540805957879994,
+      "loss": 0.6826,
+      "step": 1270
+    },
+    {
+      "epoch": 0.3389333333333333,
+      "grad_norm": 0.36417732187548524,
+      "learning_rate": 0.00015400791841966465,
+      "loss": 0.6685,
+      "step": 1271
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.38437561051333186,
+      "learning_rate": 0.00015393520075453028,
+      "loss": 0.6264,
+      "step": 1272
+    },
+    {
+      "epoch": 0.3394666666666667,
+      "grad_norm": 0.34756552123783735,
+      "learning_rate": 0.00015386244284685302,
+      "loss": 0.6493,
+      "step": 1273
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.3785874610424342,
+      "learning_rate": 0.0001537896447509195,
+      "loss": 0.6908,
+      "step": 1274
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.3669786971937423,
+      "learning_rate": 0.00015371680652104643,
+      "loss": 0.6365,
+      "step": 1275
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.36032875005693216,
+      "learning_rate": 0.00015364392821158043,
+      "loss": 0.6482,
+      "step": 1276
+    },
+    {
+      "epoch": 0.34053333333333335,
+      "grad_norm": 0.3599419557811629,
+      "learning_rate": 0.000153571009876898,
+      "loss": 0.6258,
+      "step": 1277
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3479833871068561,
+      "learning_rate": 0.00015349805157140553,
+      "loss": 0.6301,
+      "step": 1278
+    },
+    {
+      "epoch": 0.3410666666666667,
+      "grad_norm": 0.3545384568178658,
+      "learning_rate": 0.0001534250533495392,
+      "loss": 0.6587,
+      "step": 1279
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.3412159189978176,
+      "learning_rate": 0.00015335201526576507,
+      "loss": 0.5896,
+      "step": 1280
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.34608949708731057,
+      "learning_rate": 0.0001532789373745788,
+      "loss": 0.6747,
+      "step": 1281
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.3531856889857697,
+      "learning_rate": 0.00015320581973050587,
+      "loss": 0.7002,
+      "step": 1282
+    },
+    {
+      "epoch": 0.34213333333333334,
+      "grad_norm": 0.38405613464458355,
+      "learning_rate": 0.00015313266238810132,
+      "loss": 0.6717,
+      "step": 1283
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.38748352533666336,
+      "learning_rate": 0.0001530594654019499,
+      "loss": 0.6092,
+      "step": 1284
+    },
+    {
+      "epoch": 0.3426666666666667,
+      "grad_norm": 0.3713696754854172,
+      "learning_rate": 0.0001529862288266659,
+      "loss": 0.6451,
+      "step": 1285
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.35484028813406376,
+      "learning_rate": 0.00015291295271689317,
+      "loss": 0.6284,
+      "step": 1286
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.36492832853935714,
+      "learning_rate": 0.000152839637127305,
+      "loss": 0.6306,
+      "step": 1287
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.380289161842323,
+      "learning_rate": 0.00015276628211260423,
+      "loss": 0.6709,
+      "step": 1288
+    },
+    {
+      "epoch": 0.34373333333333334,
+      "grad_norm": 0.37607487520761307,
+      "learning_rate": 0.00015269288772752298,
+      "loss": 0.6409,
+      "step": 1289
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.36833212471802784,
+      "learning_rate": 0.00015261945402682292,
+      "loss": 0.6925,
+      "step": 1290
+    },
+    {
+      "epoch": 0.34426666666666667,
+      "grad_norm": 0.3426238169547332,
+      "learning_rate": 0.0001525459810652949,
+      "loss": 0.6699,
+      "step": 1291
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.348530214367812,
+      "learning_rate": 0.00015247246889775915,
+      "loss": 0.6038,
+      "step": 1292
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.3630556132797608,
+      "learning_rate": 0.00015239891757906507,
+      "loss": 0.6411,
+      "step": 1293
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.3623795702351125,
+      "learning_rate": 0.00015232532716409148,
+      "loss": 0.6337,
+      "step": 1294
+    },
+    {
+      "epoch": 0.3453333333333333,
+      "grad_norm": 0.355797002464872,
+      "learning_rate": 0.00015225169770774605,
+      "loss": 0.6704,
+      "step": 1295
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3732979701519832,
+      "learning_rate": 0.00015217802926496585,
+      "loss": 0.6693,
+      "step": 1296
+    },
+    {
+      "epoch": 0.34586666666666666,
+      "grad_norm": 0.35294389401419496,
+      "learning_rate": 0.0001521043218907169,
+      "loss": 0.6704,
+      "step": 1297
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.3978208670100884,
+      "learning_rate": 0.00015203057563999438,
+      "loss": 0.6745,
+      "step": 1298
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.34707304418306456,
+      "learning_rate": 0.00015195679056782227,
+      "loss": 0.6377,
+      "step": 1299
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.35806543959566406,
+      "learning_rate": 0.00015188296672925377,
+      "loss": 0.6831,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3469333333333333,
+      "grad_norm": 0.35503239038303996,
+      "learning_rate": 0.00015180910417937084,
+      "loss": 0.6027,
+      "step": 1301
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.6052900851469462,
+      "learning_rate": 0.00015173520297328438,
+      "loss": 0.6019,
+      "step": 1302
+    },
+    {
+      "epoch": 0.34746666666666665,
+      "grad_norm": 0.3911571636520369,
+      "learning_rate": 0.0001516612631661341,
+      "loss": 0.7034,
+      "step": 1303
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.3592736386518935,
+      "learning_rate": 0.00015158728481308852,
+      "loss": 0.6251,
+      "step": 1304
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.37374460172904495,
+      "learning_rate": 0.00015151326796934497,
+      "loss": 0.666,
+      "step": 1305
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.34285443302059637,
+      "learning_rate": 0.0001514392126901295,
+      "loss": 0.5971,
+      "step": 1306
+    },
+    {
+      "epoch": 0.3485333333333333,
+      "grad_norm": 0.34531508430173563,
+      "learning_rate": 0.0001513651190306967,
+      "loss": 0.6218,
+      "step": 1307
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.35986072901583555,
+      "learning_rate": 0.00015129098704632995,
+      "loss": 0.6438,
+      "step": 1308
+    },
+    {
+      "epoch": 0.3490666666666667,
+      "grad_norm": 0.3553678748811404,
+      "learning_rate": 0.00015121681679234112,
+      "loss": 0.6427,
+      "step": 1309
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.3568476936052233,
+      "learning_rate": 0.0001511426083240708,
+      "loss": 0.6456,
+      "step": 1310
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.33901440501681673,
+      "learning_rate": 0.00015106836169688788,
+      "loss": 0.6194,
+      "step": 1311
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.34040480268919593,
+      "learning_rate": 0.00015099407696618982,
+      "loss": 0.6477,
+      "step": 1312
+    },
+    {
+      "epoch": 0.35013333333333335,
+      "grad_norm": 0.3652503847847282,
+      "learning_rate": 0.00015091975418740256,
+      "loss": 0.6686,
+      "step": 1313
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3633498057009687,
+      "learning_rate": 0.00015084539341598036,
+      "loss": 0.6258,
+      "step": 1314
+    },
+    {
+      "epoch": 0.3506666666666667,
+      "grad_norm": 0.3487997690837372,
+      "learning_rate": 0.00015077099470740582,
+      "loss": 0.6158,
+      "step": 1315
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.3756750731507114,
+      "learning_rate": 0.00015069655811718988,
+      "loss": 0.6358,
+      "step": 1316
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.3264627050819527,
+      "learning_rate": 0.00015062208370087178,
+      "loss": 0.5964,
+      "step": 1317
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.3787090171757912,
+      "learning_rate": 0.0001505475715140189,
+      "loss": 0.6419,
+      "step": 1318
+    },
+    {
+      "epoch": 0.35173333333333334,
+      "grad_norm": 0.35407632434998715,
+      "learning_rate": 0.00015047302161222683,
+      "loss": 0.6605,
+      "step": 1319
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.36041955950042137,
+      "learning_rate": 0.0001503984340511193,
+      "loss": 0.6904,
+      "step": 1320
+    },
+    {
+      "epoch": 0.3522666666666667,
+      "grad_norm": 0.3806518021077695,
+      "learning_rate": 0.0001503238088863482,
+      "loss": 0.663,
+      "step": 1321
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.3513632008993957,
+      "learning_rate": 0.00015024914617359342,
+      "loss": 0.6354,
+      "step": 1322
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.3598337632424312,
+      "learning_rate": 0.00015017444596856282,
+      "loss": 0.6522,
+      "step": 1323
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.3626859658184959,
+      "learning_rate": 0.00015009970832699233,
+      "loss": 0.6079,
+      "step": 1324
+    },
+    {
+      "epoch": 0.35333333333333333,
+      "grad_norm": 0.3453728474529043,
+      "learning_rate": 0.0001500249333046458,
+      "loss": 0.6535,
+      "step": 1325
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3489585036348258,
+      "learning_rate": 0.00014995012095731487,
+      "loss": 0.6851,
+      "step": 1326
+    },
+    {
+      "epoch": 0.35386666666666666,
+      "grad_norm": 0.33436370252875597,
+      "learning_rate": 0.0001498752713408191,
+      "loss": 0.6245,
+      "step": 1327
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.3627241113623393,
+      "learning_rate": 0.0001498003845110059,
+      "loss": 0.6887,
+      "step": 1328
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.33857257502557025,
+      "learning_rate": 0.0001497254605237504,
+      "loss": 0.6079,
+      "step": 1329
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.35886830219258514,
+      "learning_rate": 0.0001496504994349554,
+      "loss": 0.6047,
+      "step": 1330
+    },
+    {
+      "epoch": 0.3549333333333333,
+      "grad_norm": 0.3642384147874268,
+      "learning_rate": 0.0001495755013005515,
+      "loss": 0.6125,
+      "step": 1331
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.39103131647299133,
+      "learning_rate": 0.00014950046617649685,
+      "loss": 0.6871,
+      "step": 1332
+    },
+    {
+      "epoch": 0.35546666666666665,
+      "grad_norm": 0.3586087263192577,
+      "learning_rate": 0.0001494253941187773,
+      "loss": 0.5879,
+      "step": 1333
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.3796636077562965,
+      "learning_rate": 0.00014935028518340602,
+      "loss": 0.6572,
+      "step": 1334
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.3603843501932065,
+      "learning_rate": 0.000149275139426424,
+      "loss": 0.6542,
+      "step": 1335
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.3552313013088614,
+      "learning_rate": 0.00014919995690389958,
+      "loss": 0.6769,
+      "step": 1336
+    },
+    {
+      "epoch": 0.3565333333333333,
+      "grad_norm": 0.3549500257876771,
+      "learning_rate": 0.00014912473767192841,
+      "loss": 0.6444,
+      "step": 1337
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3513257194781214,
+      "learning_rate": 0.00014904948178663373,
+      "loss": 0.6595,
+      "step": 1338
+    },
+    {
+      "epoch": 0.35706666666666664,
+      "grad_norm": 0.3656816659680909,
+      "learning_rate": 0.00014897418930416597,
+      "loss": 0.6681,
+      "step": 1339
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.3437582439392381,
+      "learning_rate": 0.00014889886028070294,
+      "loss": 0.6395,
+      "step": 1340
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.3780791131367785,
+      "learning_rate": 0.00014882349477244976,
+      "loss": 0.6763,
+      "step": 1341
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.3442496514562768,
+      "learning_rate": 0.00014874809283563865,
+      "loss": 0.6277,
+      "step": 1342
+    },
+    {
+      "epoch": 0.35813333333333336,
+      "grad_norm": 0.4031109870907002,
+      "learning_rate": 0.00014867265452652912,
+      "loss": 0.6977,
+      "step": 1343
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3638393122097646,
+      "learning_rate": 0.00014859717990140775,
+      "loss": 0.5819,
+      "step": 1344
+    },
+    {
+      "epoch": 0.3586666666666667,
+      "grad_norm": 0.3675883451529548,
+      "learning_rate": 0.0001485216690165883,
+      "loss": 0.6986,
+      "step": 1345
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.3683718766162737,
+      "learning_rate": 0.00014844612192841143,
+      "loss": 0.6866,
+      "step": 1346
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.3530276908656251,
+      "learning_rate": 0.00014837053869324498,
+      "loss": 0.6869,
+      "step": 1347
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.36759077783164884,
+      "learning_rate": 0.00014829491936748369,
+      "loss": 0.6723,
+      "step": 1348
+    },
+    {
+      "epoch": 0.35973333333333335,
+      "grad_norm": 0.3484944155661589,
+      "learning_rate": 0.00014821926400754916,
+      "loss": 0.6643,
+      "step": 1349
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.36260263794094266,
+      "learning_rate": 0.00014814357266989002,
+      "loss": 0.6403,
+      "step": 1350
+    },
+    {
+      "epoch": 0.3602666666666667,
+      "grad_norm": 0.3575160920381425,
+      "learning_rate": 0.0001480678454109816,
+      "loss": 0.6605,
+      "step": 1351
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.34636289465466796,
+      "learning_rate": 0.0001479920822873262,
+      "loss": 0.6484,
+      "step": 1352
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.34952268449787954,
+      "learning_rate": 0.00014791628335545268,
+      "loss": 0.6018,
+      "step": 1353
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.4499924583766995,
+      "learning_rate": 0.00014784044867191675,
+      "loss": 0.6709,
+      "step": 1354
+    },
+    {
+      "epoch": 0.36133333333333334,
+      "grad_norm": 0.3520854946799651,
+      "learning_rate": 0.00014776457829330077,
+      "loss": 0.6699,
+      "step": 1355
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.38016155586990236,
+      "learning_rate": 0.00014768867227621374,
+      "loss": 0.6637,
+      "step": 1356
+    },
+    {
+      "epoch": 0.36186666666666667,
+      "grad_norm": 0.4071390587483549,
+      "learning_rate": 0.0001476127306772912,
+      "loss": 0.713,
+      "step": 1357
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.3555110502794118,
+      "learning_rate": 0.00014753675355319527,
+      "loss": 0.6458,
+      "step": 1358
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.3832452061683596,
+      "learning_rate": 0.00014746074096061462,
+      "loss": 0.6454,
+      "step": 1359
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.3886559081581442,
+      "learning_rate": 0.00014738469295626433,
+      "loss": 0.6808,
+      "step": 1360
+    },
+    {
+      "epoch": 0.36293333333333333,
+      "grad_norm": 0.34133195946817063,
+      "learning_rate": 0.0001473086095968859,
+      "loss": 0.6096,
+      "step": 1361
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.37643948172272235,
+      "learning_rate": 0.00014723249093924725,
+      "loss": 0.6995,
+      "step": 1362
+    },
+    {
+      "epoch": 0.36346666666666666,
+      "grad_norm": 0.3483812561565921,
+      "learning_rate": 0.0001471563370401426,
+      "loss": 0.6081,
+      "step": 1363
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.3740810360875961,
+      "learning_rate": 0.00014708014795639248,
+      "loss": 0.6489,
+      "step": 1364
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.3520107409932921,
+      "learning_rate": 0.00014700392374484368,
+      "loss": 0.6199,
+      "step": 1365
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.33453889622578503,
+      "learning_rate": 0.00014692766446236914,
+      "loss": 0.6614,
+      "step": 1366
+    },
+    {
+      "epoch": 0.3645333333333333,
+      "grad_norm": 0.3646219251187341,
+      "learning_rate": 0.00014685137016586807,
+      "loss": 0.6086,
+      "step": 1367
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3447941866222518,
+      "learning_rate": 0.00014677504091226574,
+      "loss": 0.6298,
+      "step": 1368
+    },
+    {
+      "epoch": 0.36506666666666665,
+      "grad_norm": 0.3441247576904632,
+      "learning_rate": 0.0001466986767585135,
+      "loss": 0.6138,
+      "step": 1369
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.34802284725086,
+      "learning_rate": 0.00014662227776158877,
+      "loss": 0.6119,
+      "step": 1370
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.3477747911985361,
+      "learning_rate": 0.00014654584397849496,
+      "loss": 0.6699,
+      "step": 1371
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.3495012839218708,
+      "learning_rate": 0.00014646937546626142,
+      "loss": 0.6043,
+      "step": 1372
+    },
+    {
+      "epoch": 0.3661333333333333,
+      "grad_norm": 0.3508876127684906,
+      "learning_rate": 0.0001463928722819434,
+      "loss": 0.6609,
+      "step": 1373
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.34556557992667797,
+      "learning_rate": 0.000146316334482622,
+      "loss": 0.5956,
+      "step": 1374
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 0.35734406152055626,
+      "learning_rate": 0.00014623976212540428,
+      "loss": 0.6377,
+      "step": 1375
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.37375530548399544,
+      "learning_rate": 0.00014616315526742296,
+      "loss": 0.6843,
+      "step": 1376
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.3747004793008935,
+      "learning_rate": 0.00014608651396583647,
+      "loss": 0.6834,
+      "step": 1377
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.3729794084524959,
+      "learning_rate": 0.0001460098382778291,
+      "loss": 0.6099,
+      "step": 1378
+    },
+    {
+      "epoch": 0.36773333333333336,
+      "grad_norm": 0.35700000020842687,
+      "learning_rate": 0.00014593312826061063,
+      "loss": 0.6236,
+      "step": 1379
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.34834979298561425,
+      "learning_rate": 0.00014585638397141657,
+      "loss": 0.6051,
+      "step": 1380
+    },
+    {
+      "epoch": 0.3682666666666667,
+      "grad_norm": 0.371412487526114,
+      "learning_rate": 0.00014577960546750788,
+      "loss": 0.6251,
+      "step": 1381
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.36117426276351927,
+      "learning_rate": 0.00014570279280617119,
+      "loss": 0.6484,
+      "step": 1382
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.3734292620108866,
+      "learning_rate": 0.0001456259460447185,
+      "loss": 0.6372,
+      "step": 1383
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.3676791781546167,
+      "learning_rate": 0.00014554906524048738,
+      "loss": 0.6541,
+      "step": 1384
+    },
+    {
+      "epoch": 0.36933333333333335,
+      "grad_norm": 0.4150364677606285,
+      "learning_rate": 0.00014547215045084065,
+      "loss": 0.6629,
+      "step": 1385
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.35893859090415847,
+      "learning_rate": 0.00014539520173316653,
+      "loss": 0.6753,
+      "step": 1386
+    },
+    {
+      "epoch": 0.3698666666666667,
+      "grad_norm": 0.399740268406432,
+      "learning_rate": 0.0001453182191448787,
+      "loss": 0.6433,
+      "step": 1387
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.3652602182332252,
+      "learning_rate": 0.0001452412027434159,
+      "loss": 0.6017,
+      "step": 1388
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.360092113518417,
+      "learning_rate": 0.0001451641525862422,
+      "loss": 0.656,
+      "step": 1389
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.3750624238543075,
+      "learning_rate": 0.0001450870687308469,
+      "loss": 0.6633,
+      "step": 1390
+    },
+    {
+      "epoch": 0.37093333333333334,
+      "grad_norm": 0.3737292836860596,
+      "learning_rate": 0.00014500995123474435,
+      "loss": 0.6265,
+      "step": 1391
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.37373396390757424,
+      "learning_rate": 0.00014493280015547407,
+      "loss": 0.6276,
+      "step": 1392
+    },
+    {
+      "epoch": 0.37146666666666667,
+      "grad_norm": 0.34821266552666486,
+      "learning_rate": 0.00014485561555060058,
+      "loss": 0.6462,
+      "step": 1393
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.3805440275795302,
+      "learning_rate": 0.00014477839747771348,
+      "loss": 0.6576,
+      "step": 1394
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.3683730267700658,
+      "learning_rate": 0.00014470114599442728,
+      "loss": 0.6267,
+      "step": 1395
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.366295868060152,
+      "learning_rate": 0.00014462386115838145,
+      "loss": 0.6558,
+      "step": 1396
+    },
+    {
+      "epoch": 0.3725333333333333,
+      "grad_norm": 0.3737742074137285,
+      "learning_rate": 0.00014454654302724034,
+      "loss": 0.6286,
+      "step": 1397
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.38652327012546767,
+      "learning_rate": 0.0001444691916586932,
+      "loss": 0.6399,
+      "step": 1398
+    },
+    {
+      "epoch": 0.37306666666666666,
+      "grad_norm": 0.37861229779727024,
+      "learning_rate": 0.00014439180711045394,
+      "loss": 0.6238,
+      "step": 1399
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.3498803221410021,
+      "learning_rate": 0.00014431438944026133,
+      "loss": 0.6649,
+      "step": 1400
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.37502056890007396,
+      "learning_rate": 0.00014423693870587888,
+      "loss": 0.6662,
+      "step": 1401
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.37778399886076364,
+      "learning_rate": 0.00014415945496509464,
+      "loss": 0.6636,
+      "step": 1402
+    },
+    {
+      "epoch": 0.3741333333333333,
+      "grad_norm": 0.35723546331134165,
+      "learning_rate": 0.00014408193827572142,
+      "loss": 0.6439,
+      "step": 1403
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3613604726543159,
+      "learning_rate": 0.00014400438869559658,
+      "loss": 0.6535,
+      "step": 1404
+    },
+    {
+      "epoch": 0.37466666666666665,
+      "grad_norm": 0.34558749358937024,
+      "learning_rate": 0.000143926806282582,
+      "loss": 0.6533,
+      "step": 1405
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.359771703788518,
+      "learning_rate": 0.000143849191094564,
+      "loss": 0.6538,
+      "step": 1406
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.356332859969955,
+      "learning_rate": 0.0001437715431894535,
+      "loss": 0.644,
+      "step": 1407
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.3333631199903516,
+      "learning_rate": 0.00014369386262518566,
+      "loss": 0.6302,
+      "step": 1408
+    },
+    {
+      "epoch": 0.3757333333333333,
+      "grad_norm": 0.3467244158108116,
+      "learning_rate": 0.00014361614945972018,
+      "loss": 0.6311,
+      "step": 1409
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3554862473788559,
+      "learning_rate": 0.00014353840375104092,
+      "loss": 0.6689,
+      "step": 1410
+    },
+    {
+      "epoch": 0.3762666666666667,
+      "grad_norm": 0.36410287380349493,
+      "learning_rate": 0.0001434606255571562,
+      "loss": 0.6239,
+      "step": 1411
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.3539060822945138,
+      "learning_rate": 0.00014338281493609834,
+      "loss": 0.66,
+      "step": 1412
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.37360535660889377,
+      "learning_rate": 0.00014330497194592408,
+      "loss": 0.6479,
+      "step": 1413
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.3692522817580998,
+      "learning_rate": 0.00014322709664471423,
+      "loss": 0.6611,
+      "step": 1414
+    },
+    {
+      "epoch": 0.37733333333333335,
+      "grad_norm": 0.3726703888369673,
+      "learning_rate": 0.0001431491890905737,
+      "loss": 0.6745,
+      "step": 1415
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.33577272835165656,
+      "learning_rate": 0.00014307124934163148,
+      "loss": 0.676,
+      "step": 1416
+    },
+    {
+      "epoch": 0.3778666666666667,
+      "grad_norm": 0.368907912077969,
+      "learning_rate": 0.0001429932774560405,
+      "loss": 0.6885,
+      "step": 1417
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.3506804423496715,
+      "learning_rate": 0.00014291527349197779,
+      "loss": 0.6204,
+      "step": 1418
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.34909536845745265,
+      "learning_rate": 0.0001428372375076443,
+      "loss": 0.6724,
+      "step": 1419
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.35650561973341643,
+      "learning_rate": 0.00014275916956126475,
+      "loss": 0.6064,
+      "step": 1420
+    },
+    {
+      "epoch": 0.37893333333333334,
+      "grad_norm": 0.3457895485290377,
+      "learning_rate": 0.0001426810697110878,
+      "loss": 0.6617,
+      "step": 1421
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.345104340905149,
+      "learning_rate": 0.000142602938015386,
+      "loss": 0.6187,
+      "step": 1422
+    },
+    {
+      "epoch": 0.3794666666666667,
+      "grad_norm": 0.37404714389064475,
+      "learning_rate": 0.00014252477453245544,
+      "loss": 0.6972,
+      "step": 1423
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.3767845170610133,
+      "learning_rate": 0.00014244657932061615,
+      "loss": 0.6624,
+      "step": 1424
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.3747093185175365,
+      "learning_rate": 0.00014236835243821167,
+      "loss": 0.6034,
+      "step": 1425
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.34457907595866566,
+      "learning_rate": 0.0001422900939436093,
+      "loss": 0.6059,
+      "step": 1426
+    },
+    {
+      "epoch": 0.38053333333333333,
+      "grad_norm": 0.34566087020841413,
+      "learning_rate": 0.00014221180389519984,
+      "loss": 0.5967,
+      "step": 1427
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3651092498941689,
+      "learning_rate": 0.0001421334823513976,
+      "loss": 0.6375,
+      "step": 1428
+    },
+    {
+      "epoch": 0.38106666666666666,
+      "grad_norm": 0.37221216779026145,
+      "learning_rate": 0.00014205512937064054,
+      "loss": 0.6975,
+      "step": 1429
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.3379472287858967,
+      "learning_rate": 0.00014197674501138994,
+      "loss": 0.6518,
+      "step": 1430
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.3638046101805259,
+      "learning_rate": 0.0001418983293321305,
+      "loss": 0.6862,
+      "step": 1431
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.34576413288436314,
+      "learning_rate": 0.00014181988239137037,
+      "loss": 0.6079,
+      "step": 1432
+    },
+    {
+      "epoch": 0.3821333333333333,
+      "grad_norm": 0.35071377304977486,
+      "learning_rate": 0.000141741404247641,
+      "loss": 0.6351,
+      "step": 1433
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.35840835151802136,
+      "learning_rate": 0.00014166289495949705,
+      "loss": 0.6665,
+      "step": 1434
+    },
+    {
+      "epoch": 0.38266666666666665,
+      "grad_norm": 0.35088984607529805,
+      "learning_rate": 0.00014158435458551649,
+      "loss": 0.6775,
+      "step": 1435
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.33604592310546416,
+      "learning_rate": 0.00014150578318430042,
+      "loss": 0.6133,
+      "step": 1436
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.34857723776658783,
+      "learning_rate": 0.00014142718081447324,
+      "loss": 0.6314,
+      "step": 1437
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.3663141090467801,
+      "learning_rate": 0.00014134854753468224,
+      "loss": 0.6546,
+      "step": 1438
+    },
+    {
+      "epoch": 0.3837333333333333,
+      "grad_norm": 0.37174836362339136,
+      "learning_rate": 0.00014126988340359796,
+      "loss": 0.6718,
+      "step": 1439
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3385548087790671,
+      "learning_rate": 0.0001411911884799138,
+      "loss": 0.6074,
+      "step": 1440
+    },
+    {
+      "epoch": 0.38426666666666665,
+      "grad_norm": 0.33816388270202485,
+      "learning_rate": 0.00014111246282234624,
+      "loss": 0.6282,
+      "step": 1441
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.374631723416128,
+      "learning_rate": 0.00014103370648963474,
+      "loss": 0.695,
+      "step": 1442
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.3778845126887572,
+      "learning_rate": 0.0001409549195405415,
+      "loss": 0.6493,
+      "step": 1443
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.35557762969413714,
+      "learning_rate": 0.00014087610203385168,
+      "loss": 0.6004,
+      "step": 1444
+    },
+    {
+      "epoch": 0.38533333333333336,
+      "grad_norm": 0.3283441174659094,
+      "learning_rate": 0.00014079725402837314,
+      "loss": 0.6511,
+      "step": 1445
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.35073668316204804,
+      "learning_rate": 0.00014071837558293662,
+      "loss": 0.648,
+      "step": 1446
+    },
+    {
+      "epoch": 0.3858666666666667,
+      "grad_norm": 0.3523652123712978,
+      "learning_rate": 0.0001406394667563955,
+      "loss": 0.6253,
+      "step": 1447
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.3773427908918206,
+      "learning_rate": 0.00014056052760762577,
+      "loss": 0.6451,
+      "step": 1448
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.3810824327305423,
+      "learning_rate": 0.00014048155819552618,
+      "loss": 0.6456,
+      "step": 1449
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.35006470475505896,
+      "learning_rate": 0.00014040255857901798,
+      "loss": 0.6134,
+      "step": 1450
+    },
+    {
+      "epoch": 0.38693333333333335,
+      "grad_norm": 0.3734970644646683,
+      "learning_rate": 0.0001403235288170449,
+      "loss": 0.6322,
+      "step": 1451
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3560229088880377,
+      "learning_rate": 0.0001402444689685733,
+      "loss": 0.6162,
+      "step": 1452
+    },
+    {
+      "epoch": 0.3874666666666667,
+      "grad_norm": 0.34700815733093937,
+      "learning_rate": 0.0001401653790925919,
+      "loss": 0.6342,
+      "step": 1453
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.35302883638128907,
+      "learning_rate": 0.00014008625924811184,
+      "loss": 0.697,
+      "step": 1454
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.33782945603236564,
+      "learning_rate": 0.00014000710949416663,
+      "loss": 0.588,
+      "step": 1455
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.3580034949409188,
+      "learning_rate": 0.00013992792988981205,
+      "loss": 0.6369,
+      "step": 1456
+    },
+    {
+      "epoch": 0.38853333333333334,
+      "grad_norm": 0.3532360774481146,
+      "learning_rate": 0.00013984872049412623,
+      "loss": 0.6432,
+      "step": 1457
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.3862133257877399,
+      "learning_rate": 0.00013976948136620946,
+      "loss": 0.6698,
+      "step": 1458
+    },
+    {
+      "epoch": 0.38906666666666667,
+      "grad_norm": 0.3754654686788404,
+      "learning_rate": 0.00013969021256518424,
+      "loss": 0.7035,
+      "step": 1459
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.3443690469182567,
+      "learning_rate": 0.00013961091415019524,
+      "loss": 0.6213,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.33456931578760196,
+      "learning_rate": 0.00013953158618040917,
+      "loss": 0.619,
+      "step": 1461
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.3773389803746547,
+      "learning_rate": 0.00013945222871501487,
+      "loss": 0.6321,
+      "step": 1462
+    },
+    {
+      "epoch": 0.39013333333333333,
+      "grad_norm": 0.3684761345026691,
+      "learning_rate": 0.00013937284181322307,
+      "loss": 0.6618,
+      "step": 1463
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3584326632493809,
+      "learning_rate": 0.00013929342553426657,
+      "loss": 0.6411,
+      "step": 1464
+    },
+    {
+      "epoch": 0.39066666666666666,
+      "grad_norm": 0.32540465460956763,
+      "learning_rate": 0.0001392139799374,
+      "loss": 0.6293,
+      "step": 1465
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.33652689128674673,
+      "learning_rate": 0.0001391345050819,
+      "loss": 0.6018,
+      "step": 1466
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.3694480872477037,
+      "learning_rate": 0.0001390550010270649,
+      "loss": 0.5906,
+      "step": 1467
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.3594553565703397,
+      "learning_rate": 0.00013897546783221484,
+      "loss": 0.6488,
+      "step": 1468
+    },
+    {
+      "epoch": 0.3917333333333333,
+      "grad_norm": 0.32334944748128114,
+      "learning_rate": 0.0001388959055566918,
+      "loss": 0.6093,
+      "step": 1469
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.35605816828535247,
+      "learning_rate": 0.00013881631425985934,
+      "loss": 0.6403,
+      "step": 1470
+    },
+    {
+      "epoch": 0.39226666666666665,
+      "grad_norm": 0.3419202024337364,
+      "learning_rate": 0.00013873669400110277,
+      "loss": 0.622,
+      "step": 1471
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.36731323388180237,
+      "learning_rate": 0.00013865704483982894,
+      "loss": 0.6626,
+      "step": 1472
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.3636856419821721,
+      "learning_rate": 0.0001385773668354663,
+      "loss": 0.6372,
+      "step": 1473
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.338388744430327,
+      "learning_rate": 0.00013849766004746475,
+      "loss": 0.6249,
+      "step": 1474
+    },
+    {
+      "epoch": 0.3933333333333333,
+      "grad_norm": 0.3678788088288983,
+      "learning_rate": 0.00013841792453529581,
+      "loss": 0.6047,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3663416926907222,
+      "learning_rate": 0.00013833816035845232,
+      "loss": 0.6465,
+      "step": 1476
+    },
+    {
+      "epoch": 0.39386666666666664,
+      "grad_norm": 0.3578041769595127,
+      "learning_rate": 0.00013825836757644852,
+      "loss": 0.6409,
+      "step": 1477
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.3563270113134819,
+      "learning_rate": 0.00013817854624882,
+      "loss": 0.6554,
+      "step": 1478
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.34830123898977755,
+      "learning_rate": 0.00013809869643512367,
+      "loss": 0.6636,
+      "step": 1479
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.34219361898329487,
+      "learning_rate": 0.00013801881819493772,
+      "loss": 0.6484,
+      "step": 1480
+    },
+    {
+      "epoch": 0.39493333333333336,
+      "grad_norm": 0.35243131988303616,
+      "learning_rate": 0.00013793891158786148,
+      "loss": 0.5953,
+      "step": 1481
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.3519520099892265,
+      "learning_rate": 0.00013785897667351543,
+      "loss": 0.6255,
+      "step": 1482
+    },
+    {
+      "epoch": 0.3954666666666667,
+      "grad_norm": 0.3436903877252368,
+      "learning_rate": 0.0001377790135115413,
+      "loss": 0.6293,
+      "step": 1483
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.3377837437362064,
+      "learning_rate": 0.00013769902216160176,
+      "loss": 0.6053,
+      "step": 1484
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.3711738208922456,
+      "learning_rate": 0.0001376190026833806,
+      "loss": 0.6375,
+      "step": 1485
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.340851902526855,
+      "learning_rate": 0.0001375389551365825,
+      "loss": 0.6469,
+      "step": 1486
+    },
+    {
+      "epoch": 0.39653333333333335,
+      "grad_norm": 0.33360286197581673,
+      "learning_rate": 0.0001374588795809332,
+      "loss": 0.6191,
+      "step": 1487
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.36348833255983914,
+      "learning_rate": 0.00013737877607617927,
+      "loss": 0.6632,
+      "step": 1488
+    },
+    {
+      "epoch": 0.3970666666666667,
+      "grad_norm": 0.34114683366302295,
+      "learning_rate": 0.00013729864468208818,
+      "loss": 0.6323,
+      "step": 1489
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.36866921044462253,
+      "learning_rate": 0.0001372184854584481,
+      "loss": 0.6287,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.34049810542946163,
+      "learning_rate": 0.00013713829846506812,
+      "loss": 0.6602,
+      "step": 1491
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.3407484177480634,
+      "learning_rate": 0.0001370580837617779,
+      "loss": 0.6597,
+      "step": 1492
+    },
+    {
+      "epoch": 0.39813333333333334,
+      "grad_norm": 0.37710184555590415,
+      "learning_rate": 0.00013697784140842794,
+      "loss": 0.6393,
+      "step": 1493
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.362940222353952,
+      "learning_rate": 0.00013689757146488916,
+      "loss": 0.6197,
+      "step": 1494
+    },
+    {
+      "epoch": 0.39866666666666667,
+      "grad_norm": 0.34600692014793993,
+      "learning_rate": 0.00013681727399105328,
+      "loss": 0.6153,
+      "step": 1495
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.367833759542811,
+      "learning_rate": 0.0001367369490468324,
+      "loss": 0.6673,
+      "step": 1496
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.4110824434740751,
+      "learning_rate": 0.0001366565966921592,
+      "loss": 0.6873,
+      "step": 1497
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.351806620600648,
+      "learning_rate": 0.0001365762169869868,
+      "loss": 0.6578,
+      "step": 1498
+    },
+    {
+      "epoch": 0.39973333333333333,
+      "grad_norm": 0.3369270496471882,
+      "learning_rate": 0.0001364958099912887,
+      "loss": 0.5992,
+      "step": 1499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.34896761237819673,
+      "learning_rate": 0.0001364153757650588,
+      "loss": 0.6477,
+      "step": 1500
+    },
+    {
+      "epoch": 0.40026666666666666,
+      "grad_norm": 0.34845533390700606,
+      "learning_rate": 0.00013633491436831132,
+      "loss": 0.6206,
+      "step": 1501
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.3363710981780535,
+      "learning_rate": 0.00013625442586108065,
+      "loss": 0.6371,
+      "step": 1502
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.35188392032956,
+      "learning_rate": 0.00013617391030342158,
+      "loss": 0.6507,
+      "step": 1503
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.33431656452883624,
+      "learning_rate": 0.00013609336775540892,
+      "loss": 0.6344,
+      "step": 1504
+    },
+    {
+      "epoch": 0.4013333333333333,
+      "grad_norm": 0.3653488328515536,
+      "learning_rate": 0.00013601279827713772,
+      "loss": 0.6127,
+      "step": 1505
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.35589419078074647,
+      "learning_rate": 0.00013593220192872308,
+      "loss": 0.6474,
+      "step": 1506
+    },
+    {
+      "epoch": 0.40186666666666665,
+      "grad_norm": 0.35186548872761786,
+      "learning_rate": 0.0001358515787703002,
+      "loss": 0.66,
+      "step": 1507
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.3714718982362881,
+      "learning_rate": 0.00013577092886202417,
+      "loss": 0.6468,
+      "step": 1508
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.3453223638264368,
+      "learning_rate": 0.00013569025226407023,
+      "loss": 0.6594,
+      "step": 1509
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.35309142222904594,
+      "learning_rate": 0.00013560954903663332,
+      "loss": 0.6915,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4029333333333333,
+      "grad_norm": 0.37889247533151243,
+      "learning_rate": 0.00013552881923992839,
+      "loss": 0.6559,
+      "step": 1511
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34838037050351023,
+      "learning_rate": 0.00013544806293419015,
+      "loss": 0.6356,
+      "step": 1512
+    },
+    {
+      "epoch": 0.40346666666666664,
+      "grad_norm": 0.3880616920772269,
+      "learning_rate": 0.00013536728017967312,
+      "loss": 0.6794,
+      "step": 1513
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.38773194173789227,
+      "learning_rate": 0.00013528647103665148,
+      "loss": 0.656,
+      "step": 1514
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.33382700773093865,
+      "learning_rate": 0.0001352056355654193,
+      "loss": 0.6348,
+      "step": 1515
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.34673375611082313,
+      "learning_rate": 0.00013512477382629008,
+      "loss": 0.6744,
+      "step": 1516
+    },
+    {
+      "epoch": 0.40453333333333336,
+      "grad_norm": 0.37090956322641655,
+      "learning_rate": 0.00013504388587959695,
+      "loss": 0.6409,
+      "step": 1517
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.35147367112397365,
+      "learning_rate": 0.00013496297178569274,
+      "loss": 0.6627,
+      "step": 1518
+    },
+    {
+      "epoch": 0.4050666666666667,
+      "grad_norm": 0.36441317582458954,
+      "learning_rate": 0.00013488203160494963,
+      "loss": 0.5897,
+      "step": 1519
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.3424548610752302,
+      "learning_rate": 0.00013480106539775935,
+      "loss": 0.6124,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.3646191358554874,
+      "learning_rate": 0.00013472007322453297,
+      "loss": 0.6433,
+      "step": 1521
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.3574308271935125,
+      "learning_rate": 0.00013463905514570106,
+      "loss": 0.635,
+      "step": 1522
+    },
+    {
+      "epoch": 0.40613333333333335,
+      "grad_norm": 0.38219758508015894,
+      "learning_rate": 0.0001345580112217134,
+      "loss": 0.6675,
+      "step": 1523
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3467066500315951,
+      "learning_rate": 0.0001344769415130391,
+      "loss": 0.6376,
+      "step": 1524
+    },
+    {
+      "epoch": 0.4066666666666667,
+      "grad_norm": 0.3461541173882036,
+      "learning_rate": 0.00013439584608016653,
+      "loss": 0.6377,
+      "step": 1525
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.3499686466449366,
+      "learning_rate": 0.00013431472498360325,
+      "loss": 0.6604,
+      "step": 1526
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.34119690818259274,
+      "learning_rate": 0.00013423357828387588,
+      "loss": 0.6021,
+      "step": 1527
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.3929815220550607,
+      "learning_rate": 0.0001341524060415303,
+      "loss": 0.6554,
+      "step": 1528
+    },
+    {
+      "epoch": 0.40773333333333334,
+      "grad_norm": 0.34457738716127345,
+      "learning_rate": 0.0001340712083171313,
+      "loss": 0.616,
+      "step": 1529
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.37599960302688823,
+      "learning_rate": 0.00013398998517126276,
+      "loss": 0.6922,
+      "step": 1530
+    },
+    {
+      "epoch": 0.40826666666666667,
+      "grad_norm": 0.39660022941118506,
+      "learning_rate": 0.00013390873666452752,
+      "loss": 0.6033,
+      "step": 1531
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.3605708887523207,
+      "learning_rate": 0.00013382746285754734,
+      "loss": 0.6783,
+      "step": 1532
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.3667495928710032,
+      "learning_rate": 0.00013374616381096286,
+      "loss": 0.7042,
+      "step": 1533
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.3551671483277212,
+      "learning_rate": 0.0001336648395854335,
+      "loss": 0.655,
+      "step": 1534
+    },
+    {
+      "epoch": 0.4093333333333333,
+      "grad_norm": 0.35594835772839595,
+      "learning_rate": 0.00013358349024163754,
+      "loss": 0.6357,
+      "step": 1535
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.36026625455622185,
+      "learning_rate": 0.000133502115840272,
+      "loss": 0.6777,
+      "step": 1536
+    },
+    {
+      "epoch": 0.40986666666666666,
+      "grad_norm": 0.3586308590497421,
+      "learning_rate": 0.00013342071644205253,
+      "loss": 0.6751,
+      "step": 1537
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.3560120889098755,
+      "learning_rate": 0.00013333929210771346,
+      "loss": 0.6445,
+      "step": 1538
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.35177785685708396,
+      "learning_rate": 0.00013325784289800775,
+      "loss": 0.6036,
+      "step": 1539
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.36247147861875423,
+      "learning_rate": 0.00013317636887370696,
+      "loss": 0.655,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4109333333333333,
+      "grad_norm": 0.380556751452829,
+      "learning_rate": 0.000133094870095601,
+      "loss": 0.6579,
+      "step": 1541
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4100655898876012,
+      "learning_rate": 0.0001330133466244984,
+      "loss": 0.6818,
+      "step": 1542
+    },
+    {
+      "epoch": 0.41146666666666665,
+      "grad_norm": 0.36703019103232193,
+      "learning_rate": 0.00013293179852122612,
+      "loss": 0.6771,
+      "step": 1543
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.357059947248057,
+      "learning_rate": 0.00013285022584662946,
+      "loss": 0.6354,
+      "step": 1544
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.3497474206223572,
+      "learning_rate": 0.00013276862866157198,
+      "loss": 0.6323,
+      "step": 1545
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.3529645072569777,
+      "learning_rate": 0.0001326870070269356,
+      "loss": 0.659,
+      "step": 1546
+    },
+    {
+      "epoch": 0.4125333333333333,
+      "grad_norm": 0.3436442258301328,
+      "learning_rate": 0.00013260536100362055,
+      "loss": 0.6602,
+      "step": 1547
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3590771813513007,
+      "learning_rate": 0.00013252369065254511,
+      "loss": 0.6302,
+      "step": 1548
+    },
+    {
+      "epoch": 0.4130666666666667,
+      "grad_norm": 0.3559947854838785,
+      "learning_rate": 0.0001324419960346458,
+      "loss": 0.6378,
+      "step": 1549
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.4094994463533802,
+      "learning_rate": 0.00013236027721087723,
+      "loss": 0.603,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.35900635506623746,
+      "learning_rate": 0.00013227853424221207,
+      "loss": 0.6815,
+      "step": 1551
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.3494160339869504,
+      "learning_rate": 0.00013219676718964103,
+      "loss": 0.6624,
+      "step": 1552
+    },
+    {
+      "epoch": 0.41413333333333335,
+      "grad_norm": 0.340149423466505,
+      "learning_rate": 0.00013211497611417272,
+      "loss": 0.635,
+      "step": 1553
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.35777327817154914,
+      "learning_rate": 0.00013203316107683377,
+      "loss": 0.6357,
+      "step": 1554
+    },
+    {
+      "epoch": 0.4146666666666667,
+      "grad_norm": 0.35128954100735255,
+      "learning_rate": 0.00013195132213866866,
+      "loss": 0.6753,
+      "step": 1555
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.32580586190412875,
+      "learning_rate": 0.0001318694593607396,
+      "loss": 0.6438,
+      "step": 1556
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.345562090642469,
+      "learning_rate": 0.0001317875728041267,
+      "loss": 0.6259,
+      "step": 1557
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.34544027345472283,
+      "learning_rate": 0.0001317056625299278,
+      "loss": 0.6492,
+      "step": 1558
+    },
+    {
+      "epoch": 0.41573333333333334,
+      "grad_norm": 0.367357007877592,
+      "learning_rate": 0.00013162372859925844,
+      "loss": 0.6305,
+      "step": 1559
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3432714506183897,
+      "learning_rate": 0.00013154177107325174,
+      "loss": 0.6039,
+      "step": 1560
+    },
+    {
+      "epoch": 0.4162666666666667,
+      "grad_norm": 0.3495804273624954,
+      "learning_rate": 0.00013145979001305847,
+      "loss": 0.6296,
+      "step": 1561
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.3574362943924044,
+      "learning_rate": 0.00013137778547984703,
+      "loss": 0.6415,
+      "step": 1562
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.3637672470687809,
+      "learning_rate": 0.00013129575753480322,
+      "loss": 0.6242,
+      "step": 1563
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.3696659262777461,
+      "learning_rate": 0.0001312137062391303,
+      "loss": 0.692,
+      "step": 1564
+    },
+    {
+      "epoch": 0.41733333333333333,
+      "grad_norm": 0.3554003912618743,
+      "learning_rate": 0.00013113163165404915,
+      "loss": 0.6368,
+      "step": 1565
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.38875111923163774,
+      "learning_rate": 0.0001310495338407977,
+      "loss": 0.704,
+      "step": 1566
+    },
+    {
+      "epoch": 0.41786666666666666,
+      "grad_norm": 0.3445955597250309,
+      "learning_rate": 0.00013096741286063162,
+      "loss": 0.6099,
+      "step": 1567
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.3637008520152639,
+      "learning_rate": 0.00013088526877482343,
+      "loss": 0.6593,
+      "step": 1568
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.35403197489850313,
+      "learning_rate": 0.0001308031016446632,
+      "loss": 0.6517,
+      "step": 1569
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.36730966870295284,
+      "learning_rate": 0.00013072091153145808,
+      "loss": 0.6431,
+      "step": 1570
+    },
+    {
+      "epoch": 0.4189333333333333,
+      "grad_norm": 0.35365863428943584,
+      "learning_rate": 0.00013063869849653243,
+      "loss": 0.6652,
+      "step": 1571
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3570860852158583,
+      "learning_rate": 0.00013055646260122763,
+      "loss": 0.6182,
+      "step": 1572
+    },
+    {
+      "epoch": 0.41946666666666665,
+      "grad_norm": 0.38879671533863996,
+      "learning_rate": 0.0001304742039069021,
+      "loss": 0.6896,
+      "step": 1573
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.43243773122660395,
+      "learning_rate": 0.0001303919224749314,
+      "loss": 0.68,
+      "step": 1574
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.35840297015870565,
+      "learning_rate": 0.00013030961836670794,
+      "loss": 0.6245,
+      "step": 1575
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.36316311253279926,
+      "learning_rate": 0.00013022729164364108,
+      "loss": 0.6142,
+      "step": 1576
+    },
+    {
+      "epoch": 0.4205333333333333,
+      "grad_norm": 0.351754921269816,
+      "learning_rate": 0.0001301449423671571,
+      "loss": 0.6003,
+      "step": 1577
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.37410580813936667,
+      "learning_rate": 0.00013006257059869906,
+      "loss": 0.6186,
+      "step": 1578
+    },
+    {
+      "epoch": 0.42106666666666664,
+      "grad_norm": 0.35361578526631765,
+      "learning_rate": 0.00012998017639972677,
+      "loss": 0.615,
+      "step": 1579
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.36374019695876725,
+      "learning_rate": 0.00012989775983171688,
+      "loss": 0.5908,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.3743848873800401,
+      "learning_rate": 0.0001298153209561626,
+      "loss": 0.624,
+      "step": 1581
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.3618039358338652,
+      "learning_rate": 0.00012973285983457393,
+      "loss": 0.6252,
+      "step": 1582
+    },
+    {
+      "epoch": 0.42213333333333336,
+      "grad_norm": 0.36607735560456356,
+      "learning_rate": 0.00012965037652847732,
+      "loss": 0.6422,
+      "step": 1583
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3412080100607846,
+      "learning_rate": 0.0001295678710994159,
+      "loss": 0.6139,
+      "step": 1584
+    },
+    {
+      "epoch": 0.4226666666666667,
+      "grad_norm": 0.34046334058310873,
+      "learning_rate": 0.0001294853436089492,
+      "loss": 0.6726,
+      "step": 1585
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.3773645149144639,
+      "learning_rate": 0.00012940279411865327,
+      "loss": 0.6675,
+      "step": 1586
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.33817104462326586,
+      "learning_rate": 0.0001293202226901206,
+      "loss": 0.593,
+      "step": 1587
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.37190404574939634,
+      "learning_rate": 0.00012923762938495996,
+      "loss": 0.6512,
+      "step": 1588
+    },
+    {
+      "epoch": 0.42373333333333335,
+      "grad_norm": 0.3486381712691507,
+      "learning_rate": 0.00012915501426479656,
+      "loss": 0.6603,
+      "step": 1589
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.34465044432332115,
+      "learning_rate": 0.00012907237739127173,
+      "loss": 0.5976,
+      "step": 1590
+    },
+    {
+      "epoch": 0.4242666666666667,
+      "grad_norm": 0.35627556293759993,
+      "learning_rate": 0.00012898971882604324,
+      "loss": 0.6258,
+      "step": 1591
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.41447946758644905,
+      "learning_rate": 0.00012890703863078487,
+      "loss": 0.6143,
+      "step": 1592
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.3658966979109238,
+      "learning_rate": 0.00012882433686718656,
+      "loss": 0.6204,
+      "step": 1593
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3663147593448287,
+      "learning_rate": 0.00012874161359695445,
+      "loss": 0.6471,
+      "step": 1594
+    },
+    {
+      "epoch": 0.42533333333333334,
+      "grad_norm": 0.3574959345862566,
+      "learning_rate": 0.00012865886888181058,
+      "loss": 0.6515,
+      "step": 1595
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3333632529687575,
+      "learning_rate": 0.00012857610278349315,
+      "loss": 0.6089,
+      "step": 1596
+    },
+    {
+      "epoch": 0.42586666666666667,
+      "grad_norm": 0.3622629955492807,
+      "learning_rate": 0.00012849331536375614,
+      "loss": 0.6408,
+      "step": 1597
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.35328265411820614,
+      "learning_rate": 0.00012841050668436964,
+      "loss": 0.6231,
+      "step": 1598
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.34575833118252797,
+      "learning_rate": 0.0001283276768071194,
+      "loss": 0.6461,
+      "step": 1599
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.3394298671827499,
+      "learning_rate": 0.00012824482579380716,
+      "loss": 0.6006,
+      "step": 1600
+    },
+    {
+      "epoch": 0.42693333333333333,
+      "grad_norm": 0.38092917381766306,
+      "learning_rate": 0.00012816195370625027,
+      "loss": 0.6414,
+      "step": 1601
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4090247505789534,
+      "learning_rate": 0.00012807906060628192,
+      "loss": 0.676,
+      "step": 1602
+    },
+    {
+      "epoch": 0.42746666666666666,
+      "grad_norm": 0.35519065725042875,
+      "learning_rate": 0.00012799614655575095,
+      "loss": 0.6573,
+      "step": 1603
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.3660425425972097,
+      "learning_rate": 0.00012791321161652178,
+      "loss": 0.6237,
+      "step": 1604
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.38797207014166457,
+      "learning_rate": 0.00012783025585047452,
+      "loss": 0.6489,
+      "step": 1605
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.39178472300057515,
+      "learning_rate": 0.00012774727931950472,
+      "loss": 0.6582,
+      "step": 1606
+    },
+    {
+      "epoch": 0.4285333333333333,
+      "grad_norm": 0.3548602555156648,
+      "learning_rate": 0.00012766428208552347,
+      "loss": 0.6397,
+      "step": 1607
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.41160462341490733,
+      "learning_rate": 0.0001275812642104573,
+      "loss": 0.6398,
+      "step": 1608
+    },
+    {
+      "epoch": 0.42906666666666665,
+      "grad_norm": 0.3607476132808952,
+      "learning_rate": 0.00012749822575624812,
+      "loss": 0.6268,
+      "step": 1609
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.3813303178413586,
+      "learning_rate": 0.0001274151667848533,
+      "loss": 0.6844,
+      "step": 1610
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.35958579899889676,
+      "learning_rate": 0.00012733208735824528,
+      "loss": 0.6903,
+      "step": 1611
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.36347371465089556,
+      "learning_rate": 0.00012724898753841205,
+      "loss": 0.6213,
+      "step": 1612
+    },
+    {
+      "epoch": 0.4301333333333333,
+      "grad_norm": 0.35423954632673743,
+      "learning_rate": 0.0001271658673873566,
+      "loss": 0.6185,
+      "step": 1613
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.34804941527345973,
+      "learning_rate": 0.0001270827269670972,
+      "loss": 0.6338,
+      "step": 1614
+    },
+    {
+      "epoch": 0.43066666666666664,
+      "grad_norm": 0.35797360802324796,
+      "learning_rate": 0.00012699956633966726,
+      "loss": 0.6683,
+      "step": 1615
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.3422332310143983,
+      "learning_rate": 0.00012691638556711513,
+      "loss": 0.614,
+      "step": 1616
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.34984801626919965,
+      "learning_rate": 0.00012683318471150434,
+      "loss": 0.665,
+      "step": 1617
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.37070912012570234,
+      "learning_rate": 0.00012674996383491336,
+      "loss": 0.6208,
+      "step": 1618
+    },
+    {
+      "epoch": 0.43173333333333336,
+      "grad_norm": 0.3562232026068743,
+      "learning_rate": 0.00012666672299943552,
+      "loss": 0.6212,
+      "step": 1619
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.36034379188903254,
+      "learning_rate": 0.00012658346226717917,
+      "loss": 0.6134,
+      "step": 1620
+    },
+    {
+      "epoch": 0.4322666666666667,
+      "grad_norm": 0.3852555846814048,
+      "learning_rate": 0.0001265001817002674,
+      "loss": 0.6458,
+      "step": 1621
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.39639876575903304,
+      "learning_rate": 0.00012641688136083817,
+      "loss": 0.6625,
+      "step": 1622
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.3584288109644875,
+      "learning_rate": 0.00012633356131104415,
+      "loss": 0.6198,
+      "step": 1623
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.3764514354130069,
+      "learning_rate": 0.00012625022161305273,
+      "loss": 0.5889,
+      "step": 1624
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 0.3683177623299117,
+      "learning_rate": 0.00012616686232904594,
+      "loss": 0.6223,
+      "step": 1625
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.38144315307484467,
+      "learning_rate": 0.0001260834835212205,
+      "loss": 0.6279,
+      "step": 1626
+    },
+    {
+      "epoch": 0.4338666666666667,
+      "grad_norm": 0.36431404847538273,
+      "learning_rate": 0.00012600008525178756,
+      "loss": 0.6178,
+      "step": 1627
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.40141052098830127,
+      "learning_rate": 0.00012591666758297296,
+      "loss": 0.6678,
+      "step": 1628
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.3991071901941075,
+      "learning_rate": 0.00012583323057701687,
+      "loss": 0.7102,
+      "step": 1629
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.35572824812524084,
+      "learning_rate": 0.000125749774296174,
+      "loss": 0.6251,
+      "step": 1630
+    },
+    {
+      "epoch": 0.43493333333333334,
+      "grad_norm": 0.39366789227153004,
+      "learning_rate": 0.0001256662988027133,
+      "loss": 0.6191,
+      "step": 1631
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3403861815415497,
+      "learning_rate": 0.0001255828041589182,
+      "loss": 0.642,
+      "step": 1632
+    },
+    {
+      "epoch": 0.43546666666666667,
+      "grad_norm": 0.37721966964369996,
+      "learning_rate": 0.00012549929042708638,
+      "loss": 0.6162,
+      "step": 1633
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.3639172123188235,
+      "learning_rate": 0.00012541575766952966,
+      "loss": 0.5947,
+      "step": 1634
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.358168330792411,
+      "learning_rate": 0.0001253322059485742,
+      "loss": 0.588,
+      "step": 1635
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.37188529578213403,
+      "learning_rate": 0.00012524863532656025,
+      "loss": 0.6287,
+      "step": 1636
+    },
+    {
+      "epoch": 0.43653333333333333,
+      "grad_norm": 0.3719258571402577,
+      "learning_rate": 0.00012516504586584216,
+      "loss": 0.6721,
+      "step": 1637
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3781674849224295,
+      "learning_rate": 0.00012508143762878827,
+      "loss": 0.6446,
+      "step": 1638
+    },
+    {
+      "epoch": 0.43706666666666666,
+      "grad_norm": 0.3601080826388201,
+      "learning_rate": 0.00012499781067778107,
+      "loss": 0.7061,
+      "step": 1639
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.37014558370817746,
+      "learning_rate": 0.00012491416507521693,
+      "loss": 0.6267,
+      "step": 1640
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.3305540827558495,
+      "learning_rate": 0.0001248305008835061,
+      "loss": 0.5939,
+      "step": 1641
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.366051769668418,
+      "learning_rate": 0.00012474681816507273,
+      "loss": 0.6585,
+      "step": 1642
+    },
+    {
+      "epoch": 0.4381333333333333,
+      "grad_norm": 0.335316699243076,
+      "learning_rate": 0.0001246631169823549,
+      "loss": 0.6076,
+      "step": 1643
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3709645424099733,
+      "learning_rate": 0.00012457939739780432,
+      "loss": 0.6129,
+      "step": 1644
+    },
+    {
+      "epoch": 0.43866666666666665,
+      "grad_norm": 0.34149193135982836,
+      "learning_rate": 0.00012449565947388652,
+      "loss": 0.5817,
+      "step": 1645
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.3508505022569726,
+      "learning_rate": 0.00012441190327308057,
+      "loss": 0.6479,
+      "step": 1646
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.3611783217736963,
+      "learning_rate": 0.00012432812885787938,
+      "loss": 0.5945,
+      "step": 1647
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.3485188200809396,
+      "learning_rate": 0.00012424433629078935,
+      "loss": 0.6976,
+      "step": 1648
+    },
+    {
+      "epoch": 0.4397333333333333,
+      "grad_norm": 0.3620575104377565,
+      "learning_rate": 0.00012416052563433042,
+      "loss": 0.6358,
+      "step": 1649
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.34801802409283183,
+      "learning_rate": 0.000124076696951036,
+      "loss": 0.636,
+      "step": 1650
+    },
+    {
+      "epoch": 0.44026666666666664,
+      "grad_norm": 0.33453043806639926,
+      "learning_rate": 0.00012399285030345302,
+      "loss": 0.6469,
+      "step": 1651
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.3693898167794525,
+      "learning_rate": 0.00012390898575414177,
+      "loss": 0.6425,
+      "step": 1652
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.34613870679340925,
+      "learning_rate": 0.00012382510336567592,
+      "loss": 0.6445,
+      "step": 1653
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.35515983656786543,
+      "learning_rate": 0.00012374120320064242,
+      "loss": 0.6398,
+      "step": 1654
+    },
+    {
+      "epoch": 0.44133333333333336,
+      "grad_norm": 0.3549805974412868,
+      "learning_rate": 0.0001236572853216415,
+      "loss": 0.6044,
+      "step": 1655
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.35548373229040764,
+      "learning_rate": 0.0001235733497912866,
+      "loss": 0.6176,
+      "step": 1656
+    },
+    {
+      "epoch": 0.4418666666666667,
+      "grad_norm": 0.3595147339678269,
+      "learning_rate": 0.00012348939667220437,
+      "loss": 0.5967,
+      "step": 1657
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.3612244276840959,
+      "learning_rate": 0.00012340542602703455,
+      "loss": 0.6253,
+      "step": 1658
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.3588165377072431,
+      "learning_rate": 0.00012332143791842992,
+      "loss": 0.6079,
+      "step": 1659
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.3789856812517794,
+      "learning_rate": 0.00012323743240905634,
+      "loss": 0.6567,
+      "step": 1660
+    },
+    {
+      "epoch": 0.44293333333333335,
+      "grad_norm": 0.43464739139017444,
+      "learning_rate": 0.00012315340956159265,
+      "loss": 0.615,
+      "step": 1661
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3434686435471745,
+      "learning_rate": 0.0001230693694387306,
+      "loss": 0.6214,
+      "step": 1662
+    },
+    {
+      "epoch": 0.4434666666666667,
+      "grad_norm": 0.333891109589993,
+      "learning_rate": 0.00012298531210317486,
+      "loss": 0.654,
+      "step": 1663
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.402981975795202,
+      "learning_rate": 0.00012290123761764295,
+      "loss": 0.603,
+      "step": 1664
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.33075643690214307,
+      "learning_rate": 0.0001228171460448652,
+      "loss": 0.6214,
+      "step": 1665
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.35849018879941275,
+      "learning_rate": 0.00012273303744758454,
+      "loss": 0.6595,
+      "step": 1666
+    },
+    {
+      "epoch": 0.44453333333333334,
+      "grad_norm": 0.341626725365342,
+      "learning_rate": 0.00012264891188855677,
+      "loss": 0.6932,
+      "step": 1667
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3379362954056055,
+      "learning_rate": 0.0001225647694305503,
+      "loss": 0.636,
+      "step": 1668
+    },
+    {
+      "epoch": 0.44506666666666667,
+      "grad_norm": 0.3624029234465875,
+      "learning_rate": 0.00012248061013634618,
+      "loss": 0.6605,
+      "step": 1669
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.350033178585205,
+      "learning_rate": 0.00012239643406873792,
+      "loss": 0.6104,
+      "step": 1670
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.3388761621631631,
+      "learning_rate": 0.00012231224129053163,
+      "loss": 0.5925,
+      "step": 1671
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.3534446396927527,
+      "learning_rate": 0.0001222280318645459,
+      "loss": 0.5733,
+      "step": 1672
+    },
+    {
+      "epoch": 0.4461333333333333,
+      "grad_norm": 0.37402065822247554,
+      "learning_rate": 0.00012214380585361166,
+      "loss": 0.6791,
+      "step": 1673
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3574677235990412,
+      "learning_rate": 0.0001220595633205723,
+      "loss": 0.6748,
+      "step": 1674
+    },
+    {
+      "epoch": 0.44666666666666666,
+      "grad_norm": 0.34601388649427123,
+      "learning_rate": 0.00012197530432828348,
+      "loss": 0.6189,
+      "step": 1675
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.3501706946058701,
+      "learning_rate": 0.00012189102893961317,
+      "loss": 0.6495,
+      "step": 1676
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.35359241511647227,
+      "learning_rate": 0.00012180673721744156,
+      "loss": 0.6037,
+      "step": 1677
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.34561338141332965,
+      "learning_rate": 0.00012172242922466103,
+      "loss": 0.6455,
+      "step": 1678
+    },
+    {
+      "epoch": 0.4477333333333333,
+      "grad_norm": 0.35355188513544344,
+      "learning_rate": 0.00012163810502417611,
+      "loss": 0.6346,
+      "step": 1679
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3814744660532964,
+      "learning_rate": 0.0001215537646789034,
+      "loss": 0.6075,
+      "step": 1680
+    },
+    {
+      "epoch": 0.44826666666666665,
+      "grad_norm": 0.3579179619374581,
+      "learning_rate": 0.00012146940825177158,
+      "loss": 0.623,
+      "step": 1681
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.3422599240830082,
+      "learning_rate": 0.0001213850358057213,
+      "loss": 0.6525,
+      "step": 1682
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.34462088811090014,
+      "learning_rate": 0.00012130064740370517,
+      "loss": 0.6662,
+      "step": 1683
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.3254610992854232,
+      "learning_rate": 0.00012121624310868773,
+      "loss": 0.6245,
+      "step": 1684
+    },
+    {
+      "epoch": 0.4493333333333333,
+      "grad_norm": 0.348062684925965,
+      "learning_rate": 0.00012113182298364533,
+      "loss": 0.6503,
+      "step": 1685
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.34043658203295085,
+      "learning_rate": 0.00012104738709156615,
+      "loss": 0.6134,
+      "step": 1686
+    },
+    {
+      "epoch": 0.4498666666666667,
+      "grad_norm": 0.34879711942344566,
+      "learning_rate": 0.00012096293549545017,
+      "loss": 0.6362,
+      "step": 1687
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.34231886384769017,
+      "learning_rate": 0.00012087846825830902,
+      "loss": 0.6632,
+      "step": 1688
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.35026029746962084,
+      "learning_rate": 0.0001207939854431661,
+      "loss": 0.6335,
+      "step": 1689
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.350617316823054,
+      "learning_rate": 0.0001207094871130563,
+      "loss": 0.6261,
+      "step": 1690
+    },
+    {
+      "epoch": 0.45093333333333335,
+      "grad_norm": 0.3345279027657184,
+      "learning_rate": 0.0001206249733310262,
+      "loss": 0.6488,
+      "step": 1691
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.33078409540811543,
+      "learning_rate": 0.00012054044416013388,
+      "loss": 0.6113,
+      "step": 1692
+    },
+    {
+      "epoch": 0.4514666666666667,
+      "grad_norm": 0.3667361800254136,
+      "learning_rate": 0.00012045589966344884,
+      "loss": 0.6929,
+      "step": 1693
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.34426405973469115,
+      "learning_rate": 0.00012037133990405209,
+      "loss": 0.6006,
+      "step": 1694
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.35922862308746506,
+      "learning_rate": 0.00012028676494503602,
+      "loss": 0.6591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.3718053950465859,
+      "learning_rate": 0.00012020217484950434,
+      "loss": 0.623,
+      "step": 1696
+    },
+    {
+      "epoch": 0.45253333333333334,
+      "grad_norm": 0.3488618846207442,
+      "learning_rate": 0.00012011756968057202,
+      "loss": 0.6142,
+      "step": 1697
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3464279258634417,
+      "learning_rate": 0.00012003294950136531,
+      "loss": 0.5909,
+      "step": 1698
+    },
+    {
+      "epoch": 0.4530666666666667,
+      "grad_norm": 0.3480304555655408,
+      "learning_rate": 0.00011994831437502173,
+      "loss": 0.6487,
+      "step": 1699
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.3572698025800835,
+      "learning_rate": 0.00011986366436468985,
+      "loss": 0.6558,
+      "step": 1700
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.3445904125255456,
+      "learning_rate": 0.00011977899953352935,
+      "loss": 0.5715,
+      "step": 1701
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.36293733847573917,
+      "learning_rate": 0.00011969431994471103,
+      "loss": 0.6426,
+      "step": 1702
+    },
+    {
+      "epoch": 0.45413333333333333,
+      "grad_norm": 0.3480435846794133,
+      "learning_rate": 0.00011960962566141666,
+      "loss": 0.6391,
+      "step": 1703
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3445215698334862,
+      "learning_rate": 0.00011952491674683901,
+      "loss": 0.6097,
+      "step": 1704
+    },
+    {
+      "epoch": 0.45466666666666666,
+      "grad_norm": 0.34893762531055356,
+      "learning_rate": 0.0001194401932641817,
+      "loss": 0.6539,
+      "step": 1705
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.3426041167588452,
+      "learning_rate": 0.00011935545527665928,
+      "loss": 0.5759,
+      "step": 1706
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.3663819775175573,
+      "learning_rate": 0.00011927070284749708,
+      "loss": 0.6282,
+      "step": 1707
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.34182320319611004,
+      "learning_rate": 0.0001191859360399313,
+      "loss": 0.6234,
+      "step": 1708
+    },
+    {
+      "epoch": 0.4557333333333333,
+      "grad_norm": 0.36838546552480844,
+      "learning_rate": 0.0001191011549172087,
+      "loss": 0.627,
+      "step": 1709
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.34560021152414283,
+      "learning_rate": 0.00011901635954258688,
+      "loss": 0.5874,
+      "step": 1710
+    },
+    {
+      "epoch": 0.45626666666666665,
+      "grad_norm": 0.3412508047279534,
+      "learning_rate": 0.00011893154997933398,
+      "loss": 0.6117,
+      "step": 1711
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.35030211136931205,
+      "learning_rate": 0.00011884672629072882,
+      "loss": 0.6545,
+      "step": 1712
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.4041961222843157,
+      "learning_rate": 0.0001187618885400606,
+      "loss": 0.6445,
+      "step": 1713
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.3469558132961199,
+      "learning_rate": 0.00011867703679062915,
+      "loss": 0.623,
+      "step": 1714
+    },
+    {
+      "epoch": 0.4573333333333333,
+      "grad_norm": 0.3757630011816648,
+      "learning_rate": 0.00011859217110574475,
+      "loss": 0.6392,
+      "step": 1715
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3183113537973486,
+      "learning_rate": 0.00011850729154872797,
+      "loss": 0.5802,
+      "step": 1716
+    },
+    {
+      "epoch": 0.45786666666666664,
+      "grad_norm": 0.3488258154667218,
+      "learning_rate": 0.0001184223981829098,
+      "loss": 0.637,
+      "step": 1717
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.3557434912545583,
+      "learning_rate": 0.00011833749107163156,
+      "loss": 0.6552,
+      "step": 1718
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.34513655114089387,
+      "learning_rate": 0.00011825257027824481,
+      "loss": 0.6212,
+      "step": 1719
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.33245413565433296,
+      "learning_rate": 0.00011816763586611121,
+      "loss": 0.6141,
+      "step": 1720
+    },
+    {
+      "epoch": 0.45893333333333336,
+      "grad_norm": 0.3688351242992989,
+      "learning_rate": 0.00011808268789860273,
+      "loss": 0.6184,
+      "step": 1721
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3760740691404871,
+      "learning_rate": 0.00011799772643910137,
+      "loss": 0.6183,
+      "step": 1722
+    },
+    {
+      "epoch": 0.4594666666666667,
+      "grad_norm": 0.3459246535954564,
+      "learning_rate": 0.00011791275155099928,
+      "loss": 0.6185,
+      "step": 1723
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.3794025437279472,
+      "learning_rate": 0.0001178277632976985,
+      "loss": 0.634,
+      "step": 1724
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.3356725232679871,
+      "learning_rate": 0.00011774276174261111,
+      "loss": 0.6101,
+      "step": 1725
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.3638540939453313,
+      "learning_rate": 0.00011765774694915917,
+      "loss": 0.629,
+      "step": 1726
+    },
+    {
+      "epoch": 0.46053333333333335,
+      "grad_norm": 0.3739281041623599,
+      "learning_rate": 0.00011757271898077455,
+      "loss": 0.6349,
+      "step": 1727
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3506253837609906,
+      "learning_rate": 0.00011748767790089896,
+      "loss": 0.6114,
+      "step": 1728
+    },
+    {
+      "epoch": 0.4610666666666667,
+      "grad_norm": 0.3563669109528651,
+      "learning_rate": 0.00011740262377298389,
+      "loss": 0.6348,
+      "step": 1729
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.34733398880304556,
+      "learning_rate": 0.00011731755666049059,
+      "loss": 0.6027,
+      "step": 1730
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.3539341602831934,
+      "learning_rate": 0.00011723247662688999,
+      "loss": 0.6052,
+      "step": 1731
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.3707876295803424,
+      "learning_rate": 0.00011714738373566261,
+      "loss": 0.6527,
+      "step": 1732
+    },
+    {
+      "epoch": 0.46213333333333334,
+      "grad_norm": 0.3646423466641055,
+      "learning_rate": 0.00011706227805029863,
+      "loss": 0.6571,
+      "step": 1733
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.33711712350098677,
+      "learning_rate": 0.00011697715963429777,
+      "loss": 0.616,
+      "step": 1734
+    },
+    {
+      "epoch": 0.46266666666666667,
+      "grad_norm": 0.3451068630337151,
+      "learning_rate": 0.0001168920285511692,
+      "loss": 0.6447,
+      "step": 1735
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.3472717885125027,
+      "learning_rate": 0.00011680688486443161,
+      "loss": 0.6075,
+      "step": 1736
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.3624518453803178,
+      "learning_rate": 0.00011672172863761301,
+      "loss": 0.6025,
+      "step": 1737
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.3672398998733144,
+      "learning_rate": 0.00011663655993425086,
+      "loss": 0.5607,
+      "step": 1738
+    },
+    {
+      "epoch": 0.46373333333333333,
+      "grad_norm": 0.3498099392730004,
+      "learning_rate": 0.00011655137881789187,
+      "loss": 0.6097,
+      "step": 1739
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.34031625607024807,
+      "learning_rate": 0.000116466185352092,
+      "loss": 0.6012,
+      "step": 1740
+    },
+    {
+      "epoch": 0.46426666666666666,
+      "grad_norm": 0.3333234723759394,
+      "learning_rate": 0.00011638097960041646,
+      "loss": 0.6483,
+      "step": 1741
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.3742362318618192,
+      "learning_rate": 0.0001162957616264396,
+      "loss": 0.6419,
+      "step": 1742
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.36115062026580963,
+      "learning_rate": 0.00011621053149374492,
+      "loss": 0.6077,
+      "step": 1743
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.3961085014676561,
+      "learning_rate": 0.00011612528926592499,
+      "loss": 0.6623,
+      "step": 1744
+    },
+    {
+      "epoch": 0.4653333333333333,
+      "grad_norm": 0.3504639635390535,
+      "learning_rate": 0.00011604003500658135,
+      "loss": 0.6324,
+      "step": 1745
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.36082443526527486,
+      "learning_rate": 0.0001159547687793246,
+      "loss": 0.5789,
+      "step": 1746
+    },
+    {
+      "epoch": 0.46586666666666665,
+      "grad_norm": 0.3419926151679195,
+      "learning_rate": 0.00011586949064777424,
+      "loss": 0.6101,
+      "step": 1747
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.34934744773225196,
+      "learning_rate": 0.0001157842006755586,
+      "loss": 0.612,
+      "step": 1748
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.34795307189211017,
+      "learning_rate": 0.00011569889892631487,
+      "loss": 0.626,
+      "step": 1749
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.3565926984174023,
+      "learning_rate": 0.00011561358546368905,
+      "loss": 0.5923,
+      "step": 1750
+    },
+    {
+      "epoch": 0.4669333333333333,
+      "grad_norm": 0.3302871232944887,
+      "learning_rate": 0.00011552826035133594,
+      "loss": 0.6165,
+      "step": 1751
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.34648386915756985,
+      "learning_rate": 0.00011544292365291889,
+      "loss": 0.6059,
+      "step": 1752
+    },
+    {
+      "epoch": 0.46746666666666664,
+      "grad_norm": 0.3357397199945023,
+      "learning_rate": 0.00011535757543210995,
+      "loss": 0.6072,
+      "step": 1753
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.3811455486863015,
+      "learning_rate": 0.00011527221575258984,
+      "loss": 0.631,
+      "step": 1754
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.3450542452834387,
+      "learning_rate": 0.00011518684467804777,
+      "loss": 0.6007,
+      "step": 1755
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.3441023171434445,
+      "learning_rate": 0.00011510146227218141,
+      "loss": 0.5751,
+      "step": 1756
+    },
+    {
+      "epoch": 0.46853333333333336,
+      "grad_norm": 0.3700175540134214,
+      "learning_rate": 0.000115016068598697,
+      "loss": 0.6554,
+      "step": 1757
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4034366157451916,
+      "learning_rate": 0.00011493066372130907,
+      "loss": 0.6107,
+      "step": 1758
+    },
+    {
+      "epoch": 0.4690666666666667,
+      "grad_norm": 0.34852860508031075,
+      "learning_rate": 0.00011484524770374056,
+      "loss": 0.618,
+      "step": 1759
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.34762024645445466,
+      "learning_rate": 0.00011475982060972273,
+      "loss": 0.6791,
+      "step": 1760
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.36750238109278827,
+      "learning_rate": 0.00011467438250299509,
+      "loss": 0.6252,
+      "step": 1761
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.34093990152993797,
+      "learning_rate": 0.0001145889334473054,
+      "loss": 0.6111,
+      "step": 1762
+    },
+    {
+      "epoch": 0.47013333333333335,
+      "grad_norm": 0.34826964138058425,
+      "learning_rate": 0.00011450347350640948,
+      "loss": 0.6058,
+      "step": 1763
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3488992313334934,
+      "learning_rate": 0.00011441800274407146,
+      "loss": 0.5873,
+      "step": 1764
+    },
+    {
+      "epoch": 0.4706666666666667,
+      "grad_norm": 0.40354408907069983,
+      "learning_rate": 0.00011433252122406334,
+      "loss": 0.6472,
+      "step": 1765
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.3704286016903338,
+      "learning_rate": 0.00011424702901016533,
+      "loss": 0.6075,
+      "step": 1766
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.36193346339895377,
+      "learning_rate": 0.00011416152616616547,
+      "loss": 0.6206,
+      "step": 1767
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.35700756996017685,
+      "learning_rate": 0.00011407601275585981,
+      "loss": 0.6906,
+      "step": 1768
+    },
+    {
+      "epoch": 0.47173333333333334,
+      "grad_norm": 0.3574041314252541,
+      "learning_rate": 0.00011399048884305226,
+      "loss": 0.6414,
+      "step": 1769
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3520379979890098,
+      "learning_rate": 0.0001139049544915546,
+      "loss": 0.6065,
+      "step": 1770
+    },
+    {
+      "epoch": 0.47226666666666667,
+      "grad_norm": 0.3357214424657432,
+      "learning_rate": 0.00011381940976518634,
+      "loss": 0.5982,
+      "step": 1771
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.33496458510409605,
+      "learning_rate": 0.00011373385472777478,
+      "loss": 0.5652,
+      "step": 1772
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.36174083734164647,
+      "learning_rate": 0.00011364828944315489,
+      "loss": 0.6335,
+      "step": 1773
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.34477653403624764,
+      "learning_rate": 0.0001135627139751693,
+      "loss": 0.6089,
+      "step": 1774
+    },
+    {
+      "epoch": 0.47333333333333333,
+      "grad_norm": 0.37113886814844055,
+      "learning_rate": 0.00011347712838766824,
+      "loss": 0.6395,
+      "step": 1775
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3557849084199261,
+      "learning_rate": 0.00011339153274450945,
+      "loss": 0.6139,
+      "step": 1776
+    },
+    {
+      "epoch": 0.47386666666666666,
+      "grad_norm": 0.34053610185585476,
+      "learning_rate": 0.00011330592710955823,
+      "loss": 0.5975,
+      "step": 1777
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.3541955335602447,
+      "learning_rate": 0.00011322031154668731,
+      "loss": 0.6165,
+      "step": 1778
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.34643727950645714,
+      "learning_rate": 0.00011313468611977678,
+      "loss": 0.656,
+      "step": 1779
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.3460536062176225,
+      "learning_rate": 0.00011304905089271418,
+      "loss": 0.6254,
+      "step": 1780
+    },
+    {
+      "epoch": 0.4749333333333333,
+      "grad_norm": 0.3336864302507506,
+      "learning_rate": 0.0001129634059293943,
+      "loss": 0.6259,
+      "step": 1781
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.35071888371097154,
+      "learning_rate": 0.00011287775129371925,
+      "loss": 0.6135,
+      "step": 1782
+    },
+    {
+      "epoch": 0.47546666666666665,
+      "grad_norm": 0.34251890451142475,
+      "learning_rate": 0.00011279208704959827,
+      "loss": 0.5884,
+      "step": 1783
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.4198573769517053,
+      "learning_rate": 0.00011270641326094784,
+      "loss": 0.6486,
+      "step": 1784
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.3733782715545972,
+      "learning_rate": 0.00011262072999169155,
+      "loss": 0.6619,
+      "step": 1785
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.3636678351224484,
+      "learning_rate": 0.00011253503730576005,
+      "loss": 0.6435,
+      "step": 1786
+    },
+    {
+      "epoch": 0.4765333333333333,
+      "grad_norm": 0.3722293655153486,
+      "learning_rate": 0.000112449335267091,
+      "loss": 0.7063,
+      "step": 1787
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4045494203049154,
+      "learning_rate": 0.00011236362393962907,
+      "loss": 0.6889,
+      "step": 1788
+    },
+    {
+      "epoch": 0.4770666666666667,
+      "grad_norm": 0.41141364057819396,
+      "learning_rate": 0.00011227790338732584,
+      "loss": 0.6454,
+      "step": 1789
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.35524714004015817,
+      "learning_rate": 0.00011219217367413979,
+      "loss": 0.6594,
+      "step": 1790
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.3548249027270856,
+      "learning_rate": 0.00011210643486403622,
+      "loss": 0.6848,
+      "step": 1791
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.34990685689062734,
+      "learning_rate": 0.00011202068702098725,
+      "loss": 0.6414,
+      "step": 1792
+    },
+    {
+      "epoch": 0.47813333333333335,
+      "grad_norm": 0.36062907826450685,
+      "learning_rate": 0.00011193493020897173,
+      "loss": 0.7016,
+      "step": 1793
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3657531065016614,
+      "learning_rate": 0.00011184916449197509,
+      "loss": 0.6301,
+      "step": 1794
+    },
+    {
+      "epoch": 0.4786666666666667,
+      "grad_norm": 0.3427264254470834,
+      "learning_rate": 0.00011176338993398958,
+      "loss": 0.6625,
+      "step": 1795
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.3582835531352709,
+      "learning_rate": 0.00011167760659901396,
+      "loss": 0.6304,
+      "step": 1796
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.37796506398892793,
+      "learning_rate": 0.00011159181455105354,
+      "loss": 0.638,
+      "step": 1797
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.36483684177211584,
+      "learning_rate": 0.0001115060138541201,
+      "loss": 0.679,
+      "step": 1798
+    },
+    {
+      "epoch": 0.47973333333333334,
+      "grad_norm": 0.4053911364643288,
+      "learning_rate": 0.00011142020457223194,
+      "loss": 0.6475,
+      "step": 1799
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3429016214299657,
+      "learning_rate": 0.00011133438676941374,
+      "loss": 0.6388,
+      "step": 1800
+    },
+    {
+      "epoch": 0.4802666666666667,
+      "grad_norm": 0.3328225323854466,
+      "learning_rate": 0.00011124856050969656,
+      "loss": 0.6086,
+      "step": 1801
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.3759503815104567,
+      "learning_rate": 0.00011116272585711772,
+      "loss": 0.6549,
+      "step": 1802
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.3816139449550581,
+      "learning_rate": 0.00011107688287572075,
+      "loss": 0.6584,
+      "step": 1803
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.3488440642535208,
+      "learning_rate": 0.00011099103162955558,
+      "loss": 0.6516,
+      "step": 1804
+    },
+    {
+      "epoch": 0.48133333333333334,
+      "grad_norm": 0.31898300396226115,
+      "learning_rate": 0.00011090517218267817,
+      "loss": 0.5368,
+      "step": 1805
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3304373238663184,
+      "learning_rate": 0.00011081930459915057,
+      "loss": 0.5734,
+      "step": 1806
+    },
+    {
+      "epoch": 0.48186666666666667,
+      "grad_norm": 0.36577036159963267,
+      "learning_rate": 0.000110733428943041,
+      "loss": 0.6458,
+      "step": 1807
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.34418727674948435,
+      "learning_rate": 0.00011064754527842365,
+      "loss": 0.6379,
+      "step": 1808
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.3485730493230742,
+      "learning_rate": 0.00011056165366937868,
+      "loss": 0.6747,
+      "step": 1809
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.3579638159305423,
+      "learning_rate": 0.00011047575417999221,
+      "loss": 0.6519,
+      "step": 1810
+    },
+    {
+      "epoch": 0.4829333333333333,
+      "grad_norm": 0.37974716507113593,
+      "learning_rate": 0.00011038984687435624,
+      "loss": 0.612,
+      "step": 1811
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3853979813831866,
+      "learning_rate": 0.00011030393181656853,
+      "loss": 0.6636,
+      "step": 1812
+    },
+    {
+      "epoch": 0.48346666666666666,
+      "grad_norm": 0.3563252982196125,
+      "learning_rate": 0.00011021800907073274,
+      "loss": 0.616,
+      "step": 1813
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.33341699980169726,
+      "learning_rate": 0.00011013207870095817,
+      "loss": 0.6165,
+      "step": 1814
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.3495071001339214,
+      "learning_rate": 0.00011004614077135982,
+      "loss": 0.643,
+      "step": 1815
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.3562240275221454,
+      "learning_rate": 0.00010996019534605839,
+      "loss": 0.6336,
+      "step": 1816
+    },
+    {
+      "epoch": 0.4845333333333333,
+      "grad_norm": 0.36402598215189086,
+      "learning_rate": 0.00010987424248918013,
+      "loss": 0.6811,
+      "step": 1817
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.33606282717664393,
+      "learning_rate": 0.0001097882822648568,
+      "loss": 0.603,
+      "step": 1818
+    },
+    {
+      "epoch": 0.48506666666666665,
+      "grad_norm": 0.3520338401948451,
+      "learning_rate": 0.00010970231473722576,
+      "loss": 0.6221,
+      "step": 1819
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.3395677931780785,
+      "learning_rate": 0.00010961633997042973,
+      "loss": 0.6106,
+      "step": 1820
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.3336757458331368,
+      "learning_rate": 0.00010953035802861686,
+      "loss": 0.5563,
+      "step": 1821
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.34310758385784323,
+      "learning_rate": 0.00010944436897594064,
+      "loss": 0.5801,
+      "step": 1822
+    },
+    {
+      "epoch": 0.4861333333333333,
+      "grad_norm": 0.33464032436838764,
+      "learning_rate": 0.00010935837287655986,
+      "loss": 0.6048,
+      "step": 1823
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.31986865601205045,
+      "learning_rate": 0.00010927236979463862,
+      "loss": 0.5787,
+      "step": 1824
+    },
+    {
+      "epoch": 0.4866666666666667,
+      "grad_norm": 0.34685655386156217,
+      "learning_rate": 0.00010918635979434622,
+      "loss": 0.6008,
+      "step": 1825
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.3511924912830929,
+      "learning_rate": 0.00010910034293985701,
+      "loss": 0.6118,
+      "step": 1826
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.4194661149245277,
+      "learning_rate": 0.0001090143192953506,
+      "loss": 0.6495,
+      "step": 1827
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.3456593638418228,
+      "learning_rate": 0.00010892828892501161,
+      "loss": 0.6416,
+      "step": 1828
+    },
+    {
+      "epoch": 0.48773333333333335,
+      "grad_norm": 0.35914563192748006,
+      "learning_rate": 0.00010884225189302968,
+      "loss": 0.6221,
+      "step": 1829
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3420030648648187,
+      "learning_rate": 0.00010875620826359937,
+      "loss": 0.5958,
+      "step": 1830
+    },
+    {
+      "epoch": 0.4882666666666667,
+      "grad_norm": 0.34803579730104184,
+      "learning_rate": 0.00010867015810092026,
+      "loss": 0.584,
+      "step": 1831
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.33939694787610614,
+      "learning_rate": 0.00010858410146919674,
+      "loss": 0.6265,
+      "step": 1832
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.35164434152982926,
+      "learning_rate": 0.00010849803843263802,
+      "loss": 0.6404,
+      "step": 1833
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.34503349278235446,
+      "learning_rate": 0.0001084119690554581,
+      "loss": 0.6143,
+      "step": 1834
+    },
+    {
+      "epoch": 0.48933333333333334,
+      "grad_norm": 0.3593120141383835,
+      "learning_rate": 0.00010832589340187573,
+      "loss": 0.6367,
+      "step": 1835
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3435343643175995,
+      "learning_rate": 0.00010823981153611438,
+      "loss": 0.586,
+      "step": 1836
+    },
+    {
+      "epoch": 0.4898666666666667,
+      "grad_norm": 0.3829802288187017,
+      "learning_rate": 0.00010815372352240203,
+      "loss": 0.6401,
+      "step": 1837
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.5504776432734196,
+      "learning_rate": 0.00010806762942497138,
+      "loss": 0.6456,
+      "step": 1838
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.3432403237218622,
+      "learning_rate": 0.00010798152930805958,
+      "loss": 0.6529,
+      "step": 1839
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.3362221768777311,
+      "learning_rate": 0.00010789542323590838,
+      "loss": 0.579,
+      "step": 1840
+    },
+    {
+      "epoch": 0.49093333333333333,
+      "grad_norm": 0.3523883313706826,
+      "learning_rate": 0.00010780931127276379,
+      "loss": 0.6728,
+      "step": 1841
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3671914793983141,
+      "learning_rate": 0.00010772319348287638,
+      "loss": 0.6451,
+      "step": 1842
+    },
+    {
+      "epoch": 0.49146666666666666,
+      "grad_norm": 0.34033011515199435,
+      "learning_rate": 0.000107637069930501,
+      "loss": 0.645,
+      "step": 1843
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.3602503510963793,
+      "learning_rate": 0.00010755094067989684,
+      "loss": 0.6383,
+      "step": 1844
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.35038007368810054,
+      "learning_rate": 0.00010746480579532727,
+      "loss": 0.6729,
+      "step": 1845
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.3473701853281187,
+      "learning_rate": 0.00010737866534105993,
+      "loss": 0.5375,
+      "step": 1846
+    },
+    {
+      "epoch": 0.4925333333333333,
+      "grad_norm": 0.3708167977397802,
+      "learning_rate": 0.0001072925193813666,
+      "loss": 0.66,
+      "step": 1847
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4137343485050253,
+      "learning_rate": 0.00010720636798052314,
+      "loss": 0.6487,
+      "step": 1848
+    },
+    {
+      "epoch": 0.49306666666666665,
+      "grad_norm": 0.35047352190009584,
+      "learning_rate": 0.0001071202112028095,
+      "loss": 0.6351,
+      "step": 1849
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.3428848289584383,
+      "learning_rate": 0.00010703404911250962,
+      "loss": 0.6065,
+      "step": 1850
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.3712163179708831,
+      "learning_rate": 0.00010694788177391145,
+      "loss": 0.6395,
+      "step": 1851
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.34258943995828145,
+      "learning_rate": 0.00010686170925130678,
+      "loss": 0.5836,
+      "step": 1852
+    },
+    {
+      "epoch": 0.4941333333333333,
+      "grad_norm": 0.3634496024934811,
+      "learning_rate": 0.00010677553160899135,
+      "loss": 0.5987,
+      "step": 1853
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3494288944374076,
+      "learning_rate": 0.00010668934891126458,
+      "loss": 0.6173,
+      "step": 1854
+    },
+    {
+      "epoch": 0.49466666666666664,
+      "grad_norm": 0.3858477887669424,
+      "learning_rate": 0.00010660316122242988,
+      "loss": 0.6689,
+      "step": 1855
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.3316806723574418,
+      "learning_rate": 0.00010651696860679425,
+      "loss": 0.6091,
+      "step": 1856
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.34001195560173475,
+      "learning_rate": 0.00010643077112866831,
+      "loss": 0.5693,
+      "step": 1857
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.3730457710225014,
+      "learning_rate": 0.00010634456885236643,
+      "loss": 0.6383,
+      "step": 1858
+    },
+    {
+      "epoch": 0.49573333333333336,
+      "grad_norm": 0.3529477327900418,
+      "learning_rate": 0.0001062583618422065,
+      "loss": 0.6602,
+      "step": 1859
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.35275175353116595,
+      "learning_rate": 0.00010617215016250996,
+      "loss": 0.585,
+      "step": 1860
+    },
+    {
+      "epoch": 0.4962666666666667,
+      "grad_norm": 0.3436185227798171,
+      "learning_rate": 0.00010608593387760171,
+      "loss": 0.6216,
+      "step": 1861
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.3403844645528916,
+      "learning_rate": 0.00010599971305181012,
+      "loss": 0.6036,
+      "step": 1862
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.3765236264940643,
+      "learning_rate": 0.00010591348774946687,
+      "loss": 0.6681,
+      "step": 1863
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.3739579366865447,
+      "learning_rate": 0.00010582725803490714,
+      "loss": 0.6859,
+      "step": 1864
+    },
+    {
+      "epoch": 0.49733333333333335,
+      "grad_norm": 0.3413576702802035,
+      "learning_rate": 0.00010574102397246921,
+      "loss": 0.6093,
+      "step": 1865
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.35345468868149615,
+      "learning_rate": 0.00010565478562649476,
+      "loss": 0.6227,
+      "step": 1866
+    },
+    {
+      "epoch": 0.4978666666666667,
+      "grad_norm": 0.3588273500333482,
+      "learning_rate": 0.00010556854306132855,
+      "loss": 0.6199,
+      "step": 1867
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.3866091359716914,
+      "learning_rate": 0.00010548229634131858,
+      "loss": 0.6377,
+      "step": 1868
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.3240242407888893,
+      "learning_rate": 0.0001053960455308159,
+      "loss": 0.5541,
+      "step": 1869
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.3477715307092799,
+      "learning_rate": 0.00010530979069417461,
+      "loss": 0.5971,
+      "step": 1870
+    },
+    {
+      "epoch": 0.49893333333333334,
+      "grad_norm": 0.34168835161930505,
+      "learning_rate": 0.00010522353189575183,
+      "loss": 0.6419,
+      "step": 1871
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3530478591879385,
+      "learning_rate": 0.00010513726919990762,
+      "loss": 0.6347,
+      "step": 1872
+    },
+    {
+      "epoch": 0.49946666666666667,
+      "grad_norm": 0.32991880649694866,
+      "learning_rate": 0.000105051002671005,
+      "loss": 0.5562,
+      "step": 1873
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.340019310969109,
+      "learning_rate": 0.00010496473237340978,
+      "loss": 0.6271,
+      "step": 1874
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.3403011764625951,
+      "learning_rate": 0.00010487845837149062,
+      "loss": 0.5954,
+      "step": 1875
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.3404496729556034,
+      "learning_rate": 0.00010479218072961892,
+      "loss": 0.6603,
+      "step": 1876
+    },
+    {
+      "epoch": 0.5005333333333334,
+      "grad_norm": 0.34784316782713803,
+      "learning_rate": 0.00010470589951216882,
+      "loss": 0.6372,
+      "step": 1877
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3468042077926638,
+      "learning_rate": 0.00010461961478351711,
+      "loss": 0.6162,
+      "step": 1878
+    },
+    {
+      "epoch": 0.5010666666666667,
+      "grad_norm": 0.3441622189263432,
+      "learning_rate": 0.00010453332660804327,
+      "loss": 0.5968,
+      "step": 1879
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.36661564652960343,
+      "learning_rate": 0.0001044470350501292,
+      "loss": 0.6095,
+      "step": 1880
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.3461679275941061,
+      "learning_rate": 0.00010436074017415947,
+      "loss": 0.6265,
+      "step": 1881
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.3409084709412113,
+      "learning_rate": 0.00010427444204452103,
+      "loss": 0.5959,
+      "step": 1882
+    },
+    {
+      "epoch": 0.5021333333333333,
+      "grad_norm": 0.3719425555366416,
+      "learning_rate": 0.00010418814072560337,
+      "loss": 0.6138,
+      "step": 1883
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.33504316439542964,
+      "learning_rate": 0.00010410183628179822,
+      "loss": 0.6303,
+      "step": 1884
+    },
+    {
+      "epoch": 0.5026666666666667,
+      "grad_norm": 0.3589984051590842,
+      "learning_rate": 0.00010401552877749973,
+      "loss": 0.5878,
+      "step": 1885
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.3561547910470736,
+      "learning_rate": 0.00010392921827710432,
+      "loss": 0.6125,
+      "step": 1886
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.3381142821458892,
+      "learning_rate": 0.00010384290484501064,
+      "loss": 0.5954,
+      "step": 1887
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.3514335414345356,
+      "learning_rate": 0.00010375658854561952,
+      "loss": 0.5902,
+      "step": 1888
+    },
+    {
+      "epoch": 0.5037333333333334,
+      "grad_norm": 0.3316766610681647,
+      "learning_rate": 0.00010367026944333391,
+      "loss": 0.6192,
+      "step": 1889
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.33754447514834407,
+      "learning_rate": 0.00010358394760255892,
+      "loss": 0.6267,
+      "step": 1890
+    },
+    {
+      "epoch": 0.5042666666666666,
+      "grad_norm": 0.3392906742883365,
+      "learning_rate": 0.00010349762308770163,
+      "loss": 0.6284,
+      "step": 1891
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.3303627343601473,
+      "learning_rate": 0.00010341129596317114,
+      "loss": 0.5755,
+      "step": 1892
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.39575470525642015,
+      "learning_rate": 0.00010332496629337854,
+      "loss": 0.6271,
+      "step": 1893
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.3605586831239872,
+      "learning_rate": 0.00010323863414273674,
+      "loss": 0.6174,
+      "step": 1894
+    },
+    {
+      "epoch": 0.5053333333333333,
+      "grad_norm": 0.35256415973273353,
+      "learning_rate": 0.00010315229957566059,
+      "loss": 0.615,
+      "step": 1895
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3424145120737014,
+      "learning_rate": 0.00010306596265656663,
+      "loss": 0.6222,
+      "step": 1896
+    },
+    {
+      "epoch": 0.5058666666666667,
+      "grad_norm": 0.3445887350666713,
+      "learning_rate": 0.00010297962344987326,
+      "loss": 0.6396,
+      "step": 1897
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.3629993823914323,
+      "learning_rate": 0.00010289328202000055,
+      "loss": 0.6737,
+      "step": 1898
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.3665736331797972,
+      "learning_rate": 0.00010280693843137019,
+      "loss": 0.6367,
+      "step": 1899
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.34092626926986974,
+      "learning_rate": 0.00010272059274840555,
+      "loss": 0.5757,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5069333333333333,
+      "grad_norm": 0.3412486703966795,
+      "learning_rate": 0.00010263424503553155,
+      "loss": 0.5863,
+      "step": 1901
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3546342606220995,
+      "learning_rate": 0.00010254789535717456,
+      "loss": 0.6288,
+      "step": 1902
+    },
+    {
+      "epoch": 0.5074666666666666,
+      "grad_norm": 0.3469896420351848,
+      "learning_rate": 0.00010246154377776246,
+      "loss": 0.5865,
+      "step": 1903
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.32606735504146694,
+      "learning_rate": 0.00010237519036172459,
+      "loss": 0.6053,
+      "step": 1904
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.3155514996696917,
+      "learning_rate": 0.00010228883517349154,
+      "loss": 0.5819,
+      "step": 1905
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.36122419725229143,
+      "learning_rate": 0.0001022024782774954,
+      "loss": 0.6305,
+      "step": 1906
+    },
+    {
+      "epoch": 0.5085333333333333,
+      "grad_norm": 0.3758738629272625,
+      "learning_rate": 0.0001021161197381694,
+      "loss": 0.5897,
+      "step": 1907
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3668727531150211,
+      "learning_rate": 0.00010202975961994798,
+      "loss": 0.6725,
+      "step": 1908
+    },
+    {
+      "epoch": 0.5090666666666667,
+      "grad_norm": 0.3683290010864964,
+      "learning_rate": 0.00010194339798726684,
+      "loss": 0.6397,
+      "step": 1909
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.36378539233678114,
+      "learning_rate": 0.0001018570349045628,
+      "loss": 0.5792,
+      "step": 1910
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.35361595482702723,
+      "learning_rate": 0.00010177067043627375,
+      "loss": 0.6149,
+      "step": 1911
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.351717486992583,
+      "learning_rate": 0.00010168430464683856,
+      "loss": 0.6001,
+      "step": 1912
+    },
+    {
+      "epoch": 0.5101333333333333,
+      "grad_norm": 0.3599675743587252,
+      "learning_rate": 0.00010159793760069715,
+      "loss": 0.5971,
+      "step": 1913
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3554409815569149,
+      "learning_rate": 0.0001015115693622904,
+      "loss": 0.6021,
+      "step": 1914
+    },
+    {
+      "epoch": 0.5106666666666667,
+      "grad_norm": 0.3653117929623009,
+      "learning_rate": 0.00010142519999605997,
+      "loss": 0.6292,
+      "step": 1915
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.34022426543298023,
+      "learning_rate": 0.00010133882956644846,
+      "loss": 0.5937,
+      "step": 1916
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.3498122610654033,
+      "learning_rate": 0.00010125245813789923,
+      "loss": 0.6385,
+      "step": 1917
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.3498140175570313,
+      "learning_rate": 0.0001011660857748564,
+      "loss": 0.6576,
+      "step": 1918
+    },
+    {
+      "epoch": 0.5117333333333334,
+      "grad_norm": 0.36061234115656216,
+      "learning_rate": 0.00010107971254176475,
+      "loss": 0.6691,
+      "step": 1919
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.36038800741636573,
+      "learning_rate": 0.00010099333850306977,
+      "loss": 0.6582,
+      "step": 1920
+    },
+    {
+      "epoch": 0.5122666666666666,
+      "grad_norm": 0.3186736897666428,
+      "learning_rate": 0.0001009069637232175,
+      "loss": 0.5633,
+      "step": 1921
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.3465212424096694,
+      "learning_rate": 0.00010082058826665457,
+      "loss": 0.5948,
+      "step": 1922
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.34003543567676386,
+      "learning_rate": 0.00010073421219782804,
+      "loss": 0.6234,
+      "step": 1923
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.34016776681136945,
+      "learning_rate": 0.00010064783558118552,
+      "loss": 0.6,
+      "step": 1924
+    },
+    {
+      "epoch": 0.5133333333333333,
+      "grad_norm": 0.3581297499766351,
+      "learning_rate": 0.00010056145848117497,
+      "loss": 0.6047,
+      "step": 1925
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3514092449630491,
+      "learning_rate": 0.00010047508096224476,
+      "loss": 0.5896,
+      "step": 1926
+    },
+    {
+      "epoch": 0.5138666666666667,
+      "grad_norm": 0.3242359772997438,
+      "learning_rate": 0.0001003887030888435,
+      "loss": 0.6051,
+      "step": 1927
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.34380597190413853,
+      "learning_rate": 0.00010030232492542014,
+      "loss": 0.5986,
+      "step": 1928
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.3431258525131172,
+      "learning_rate": 0.00010021594653642379,
+      "loss": 0.6541,
+      "step": 1929
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.34331857592804965,
+      "learning_rate": 0.0001001295679863038,
+      "loss": 0.6492,
+      "step": 1930
+    },
+    {
+      "epoch": 0.5149333333333334,
+      "grad_norm": 0.33973108768418797,
+      "learning_rate": 0.00010004318933950953,
+      "loss": 0.6008,
+      "step": 1931
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3446940487143982,
+      "learning_rate": 9.99568106604905e-05,
+      "loss": 0.6176,
+      "step": 1932
+    },
+    {
+      "epoch": 0.5154666666666666,
+      "grad_norm": 0.37330428433218943,
+      "learning_rate": 9.987043201369622e-05,
+      "loss": 0.6337,
+      "step": 1933
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.3396927795752247,
+      "learning_rate": 9.978405346357621e-05,
+      "loss": 0.621,
+      "step": 1934
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.34460019902033295,
+      "learning_rate": 9.969767507457987e-05,
+      "loss": 0.5934,
+      "step": 1935
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.3490440959282949,
+      "learning_rate": 9.961129691115653e-05,
+      "loss": 0.6572,
+      "step": 1936
+    },
+    {
+      "epoch": 0.5165333333333333,
+      "grad_norm": 0.34407504648219744,
+      "learning_rate": 9.952491903775529e-05,
+      "loss": 0.581,
+      "step": 1937
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.34365704470410735,
+      "learning_rate": 9.943854151882505e-05,
+      "loss": 0.59,
+      "step": 1938
+    },
+    {
+      "epoch": 0.5170666666666667,
+      "grad_norm": 0.3582436374601057,
+      "learning_rate": 9.935216441881451e-05,
+      "loss": 0.6189,
+      "step": 1939
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.3845217823198548,
+      "learning_rate": 9.926578780217199e-05,
+      "loss": 0.6089,
+      "step": 1940
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.32116870741455084,
+      "learning_rate": 9.917941173334545e-05,
+      "loss": 0.5844,
+      "step": 1941
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.34636988073483993,
+      "learning_rate": 9.909303627678249e-05,
+      "loss": 0.5766,
+      "step": 1942
+    },
+    {
+      "epoch": 0.5181333333333333,
+      "grad_norm": 0.35704539747501596,
+      "learning_rate": 9.900666149693022e-05,
+      "loss": 0.623,
+      "step": 1943
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3714349645043707,
+      "learning_rate": 9.892028745823526e-05,
+      "loss": 0.6401,
+      "step": 1944
+    },
+    {
+      "epoch": 0.5186666666666667,
+      "grad_norm": 0.3255639284042641,
+      "learning_rate": 9.883391422514362e-05,
+      "loss": 0.5781,
+      "step": 1945
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.3353133665007348,
+      "learning_rate": 9.874754186210078e-05,
+      "loss": 0.612,
+      "step": 1946
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.3669824930961404,
+      "learning_rate": 9.866117043355156e-05,
+      "loss": 0.6301,
+      "step": 1947
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.3342869403864703,
+      "learning_rate": 9.857480000394006e-05,
+      "loss": 0.5668,
+      "step": 1948
+    },
+    {
+      "epoch": 0.5197333333333334,
+      "grad_norm": 0.3921780213818484,
+      "learning_rate": 9.848843063770962e-05,
+      "loss": 0.587,
+      "step": 1949
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.35479788707660537,
+      "learning_rate": 9.840206239930286e-05,
+      "loss": 0.5925,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5202666666666667,
+      "grad_norm": 0.3401804529833413,
+      "learning_rate": 9.831569535316144e-05,
+      "loss": 0.6009,
+      "step": 1951
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.35423663734328054,
+      "learning_rate": 9.82293295637263e-05,
+      "loss": 0.576,
+      "step": 1952
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.3476262381142906,
+      "learning_rate": 9.814296509543724e-05,
+      "loss": 0.5986,
+      "step": 1953
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.38158077018818914,
+      "learning_rate": 9.805660201273317e-05,
+      "loss": 0.7074,
+      "step": 1954
+    },
+    {
+      "epoch": 0.5213333333333333,
+      "grad_norm": 0.32643123888065656,
+      "learning_rate": 9.797024038005204e-05,
+      "loss": 0.5948,
+      "step": 1955
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.34154734922813573,
+      "learning_rate": 9.788388026183063e-05,
+      "loss": 0.6104,
+      "step": 1956
+    },
+    {
+      "epoch": 0.5218666666666667,
+      "grad_norm": 0.33051183124715117,
+      "learning_rate": 9.779752172250461e-05,
+      "loss": 0.5731,
+      "step": 1957
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.3345259852415471,
+      "learning_rate": 9.771116482650844e-05,
+      "loss": 0.5393,
+      "step": 1958
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.3403340455917885,
+      "learning_rate": 9.762480963827546e-05,
+      "loss": 0.5962,
+      "step": 1959
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.34059588784734357,
+      "learning_rate": 9.753845622223757e-05,
+      "loss": 0.6118,
+      "step": 1960
+    },
+    {
+      "epoch": 0.5229333333333334,
+      "grad_norm": 0.3486927682382407,
+      "learning_rate": 9.745210464282548e-05,
+      "loss": 0.6159,
+      "step": 1961
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.34613355506330856,
+      "learning_rate": 9.736575496446848e-05,
+      "loss": 0.593,
+      "step": 1962
+    },
+    {
+      "epoch": 0.5234666666666666,
+      "grad_norm": 0.3269411928179518,
+      "learning_rate": 9.727940725159446e-05,
+      "loss": 0.5678,
+      "step": 1963
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.3321455675862325,
+      "learning_rate": 9.719306156862982e-05,
+      "loss": 0.5968,
+      "step": 1964
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.33771013399845196,
+      "learning_rate": 9.710671797999946e-05,
+      "loss": 0.6129,
+      "step": 1965
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.3460757106477492,
+      "learning_rate": 9.702037655012675e-05,
+      "loss": 0.6263,
+      "step": 1966
+    },
+    {
+      "epoch": 0.5245333333333333,
+      "grad_norm": 0.33148582820188244,
+      "learning_rate": 9.693403734343342e-05,
+      "loss": 0.5713,
+      "step": 1967
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.349995296297823,
+      "learning_rate": 9.684770042433946e-05,
+      "loss": 0.6109,
+      "step": 1968
+    },
+    {
+      "epoch": 0.5250666666666667,
+      "grad_norm": 0.3618880675955362,
+      "learning_rate": 9.676136585726328e-05,
+      "loss": 0.6668,
+      "step": 1969
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.3560408216584042,
+      "learning_rate": 9.667503370662148e-05,
+      "loss": 0.6268,
+      "step": 1970
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.3340972286533265,
+      "learning_rate": 9.658870403682888e-05,
+      "loss": 0.5956,
+      "step": 1971
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.3556410848409868,
+      "learning_rate": 9.65023769122984e-05,
+      "loss": 0.6361,
+      "step": 1972
+    },
+    {
+      "epoch": 0.5261333333333333,
+      "grad_norm": 0.36388608450533694,
+      "learning_rate": 9.64160523974411e-05,
+      "loss": 0.6372,
+      "step": 1973
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.3608893800242963,
+      "learning_rate": 9.632973055666611e-05,
+      "loss": 0.6066,
+      "step": 1974
+    },
+    {
+      "epoch": 0.5266666666666666,
+      "grad_norm": 0.3542234429582837,
+      "learning_rate": 9.624341145438053e-05,
+      "loss": 0.6021,
+      "step": 1975
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.3646966851994873,
+      "learning_rate": 9.615709515498939e-05,
+      "loss": 0.6431,
+      "step": 1976
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.3567920875456539,
+      "learning_rate": 9.607078172289569e-05,
+      "loss": 0.6043,
+      "step": 1977
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.3455903642761181,
+      "learning_rate": 9.598447122250029e-05,
+      "loss": 0.5914,
+      "step": 1978
+    },
+    {
+      "epoch": 0.5277333333333334,
+      "grad_norm": 0.3350469040038476,
+      "learning_rate": 9.589816371820179e-05,
+      "loss": 0.6445,
+      "step": 1979
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.33382522064086734,
+      "learning_rate": 9.581185927439665e-05,
+      "loss": 0.6278,
+      "step": 1980
+    },
+    {
+      "epoch": 0.5282666666666667,
+      "grad_norm": 0.34817099587173167,
+      "learning_rate": 9.572555795547896e-05,
+      "loss": 0.5998,
+      "step": 1981
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.34045618454381377,
+      "learning_rate": 9.563925982584054e-05,
+      "loss": 0.6376,
+      "step": 1982
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.37164786998146804,
+      "learning_rate": 9.555296494987083e-05,
+      "loss": 0.6673,
+      "step": 1983
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.34117451599058546,
+      "learning_rate": 9.546667339195678e-05,
+      "loss": 0.6478,
+      "step": 1984
+    },
+    {
+      "epoch": 0.5293333333333333,
+      "grad_norm": 0.32393108934758247,
+      "learning_rate": 9.53803852164829e-05,
+      "loss": 0.6027,
+      "step": 1985
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3563116464426639,
+      "learning_rate": 9.529410048783119e-05,
+      "loss": 0.6435,
+      "step": 1986
+    },
+    {
+      "epoch": 0.5298666666666667,
+      "grad_norm": 0.33473710068350665,
+      "learning_rate": 9.520781927038111e-05,
+      "loss": 0.6069,
+      "step": 1987
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.3528834640270041,
+      "learning_rate": 9.51215416285094e-05,
+      "loss": 0.6205,
+      "step": 1988
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.3465769571711552,
+      "learning_rate": 9.503526762659023e-05,
+      "loss": 0.6531,
+      "step": 1989
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.33870023059505733,
+      "learning_rate": 9.4948997328995e-05,
+      "loss": 0.5928,
+      "step": 1990
+    },
+    {
+      "epoch": 0.5309333333333334,
+      "grad_norm": 0.3408493516549377,
+      "learning_rate": 9.486273080009238e-05,
+      "loss": 0.5853,
+      "step": 1991
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3429692651375918,
+      "learning_rate": 9.47764681042482e-05,
+      "loss": 0.63,
+      "step": 1992
+    },
+    {
+      "epoch": 0.5314666666666666,
+      "grad_norm": 0.3318651483214718,
+      "learning_rate": 9.46902093058254e-05,
+      "loss": 0.5924,
+      "step": 1993
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.3586569129780557,
+      "learning_rate": 9.460395446918412e-05,
+      "loss": 0.6612,
+      "step": 1994
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.36558670340223703,
+      "learning_rate": 9.451770365868143e-05,
+      "loss": 0.6199,
+      "step": 1995
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.3570630351766155,
+      "learning_rate": 9.443145693867145e-05,
+      "loss": 0.612,
+      "step": 1996
+    },
+    {
+      "epoch": 0.5325333333333333,
+      "grad_norm": 0.3442960505629333,
+      "learning_rate": 9.434521437350525e-05,
+      "loss": 0.5844,
+      "step": 1997
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3410095445432704,
+      "learning_rate": 9.425897602753083e-05,
+      "loss": 0.5924,
+      "step": 1998
+    },
+    {
+      "epoch": 0.5330666666666667,
+      "grad_norm": 0.34181677698600293,
+      "learning_rate": 9.417274196509289e-05,
+      "loss": 0.6081,
+      "step": 1999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.33956914009237654,
+      "learning_rate": 9.408651225053314e-05,
+      "loss": 0.5726,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.3298280497269895,
+      "learning_rate": 9.400028694818992e-05,
+      "loss": 0.6169,
+      "step": 2001
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.344816811209219,
+      "learning_rate": 9.39140661223983e-05,
+      "loss": 0.5984,
+      "step": 2002
+    },
+    {
+      "epoch": 0.5341333333333333,
+      "grad_norm": 0.3372514333415848,
+      "learning_rate": 9.382784983749005e-05,
+      "loss": 0.5868,
+      "step": 2003
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3474693000685869,
+      "learning_rate": 9.37416381577935e-05,
+      "loss": 0.5806,
+      "step": 2004
+    },
+    {
+      "epoch": 0.5346666666666666,
+      "grad_norm": 0.35926899358715614,
+      "learning_rate": 9.365543114763357e-05,
+      "loss": 0.6578,
+      "step": 2005
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.3271872494616578,
+      "learning_rate": 9.356922887133173e-05,
+      "loss": 0.6303,
+      "step": 2006
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.33368318359185056,
+      "learning_rate": 9.34830313932058e-05,
+      "loss": 0.5801,
+      "step": 2007
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.3394449024892054,
+      "learning_rate": 9.339683877757014e-05,
+      "loss": 0.6069,
+      "step": 2008
+    },
+    {
+      "epoch": 0.5357333333333333,
+      "grad_norm": 0.340512022100245,
+      "learning_rate": 9.331065108873543e-05,
+      "loss": 0.6031,
+      "step": 2009
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3544515933008161,
+      "learning_rate": 9.322446839100869e-05,
+      "loss": 0.6796,
+      "step": 2010
+    },
+    {
+      "epoch": 0.5362666666666667,
+      "grad_norm": 0.34520332137560916,
+      "learning_rate": 9.313829074869323e-05,
+      "loss": 0.6136,
+      "step": 2011
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.3278816100917621,
+      "learning_rate": 9.305211822608856e-05,
+      "loss": 0.5916,
+      "step": 2012
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.3388663985330351,
+      "learning_rate": 9.296595088749036e-05,
+      "loss": 0.6183,
+      "step": 2013
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.4020491790596033,
+      "learning_rate": 9.287978879719053e-05,
+      "loss": 0.6291,
+      "step": 2014
+    },
+    {
+      "epoch": 0.5373333333333333,
+      "grad_norm": 0.5624815817379968,
+      "learning_rate": 9.279363201947689e-05,
+      "loss": 0.6057,
+      "step": 2015
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.35953229762593675,
+      "learning_rate": 9.270748061863344e-05,
+      "loss": 0.6232,
+      "step": 2016
+    },
+    {
+      "epoch": 0.5378666666666667,
+      "grad_norm": 0.32891363978155613,
+      "learning_rate": 9.262133465894009e-05,
+      "loss": 0.593,
+      "step": 2017
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.3518251086921895,
+      "learning_rate": 9.253519420467275e-05,
+      "loss": 0.622,
+      "step": 2018
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.3643982126011602,
+      "learning_rate": 9.244905932010319e-05,
+      "loss": 0.6277,
+      "step": 2019
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.3413844994342637,
+      "learning_rate": 9.2362930069499e-05,
+      "loss": 0.6549,
+      "step": 2020
+    },
+    {
+      "epoch": 0.5389333333333334,
+      "grad_norm": 0.33484175894642043,
+      "learning_rate": 9.227680651712362e-05,
+      "loss": 0.5621,
+      "step": 2021
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3308715436992413,
+      "learning_rate": 9.219068872723625e-05,
+      "loss": 0.5836,
+      "step": 2022
+    },
+    {
+      "epoch": 0.5394666666666666,
+      "grad_norm": 0.35692259824731815,
+      "learning_rate": 9.210457676409167e-05,
+      "loss": 0.6251,
+      "step": 2023
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.3375423674679303,
+      "learning_rate": 9.201847069194043e-05,
+      "loss": 0.6206,
+      "step": 2024
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.41866905304563634,
+      "learning_rate": 9.193237057502864e-05,
+      "loss": 0.6342,
+      "step": 2025
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.334719998891011,
+      "learning_rate": 9.184627647759799e-05,
+      "loss": 0.6084,
+      "step": 2026
+    },
+    {
+      "epoch": 0.5405333333333333,
+      "grad_norm": 0.3385689320503021,
+      "learning_rate": 9.176018846388565e-05,
+      "loss": 0.6281,
+      "step": 2027
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.36206757130750844,
+      "learning_rate": 9.167410659812428e-05,
+      "loss": 0.6346,
+      "step": 2028
+    },
+    {
+      "epoch": 0.5410666666666667,
+      "grad_norm": 0.3541779180060866,
+      "learning_rate": 9.158803094454192e-05,
+      "loss": 0.5989,
+      "step": 2029
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.3251690401987799,
+      "learning_rate": 9.150196156736203e-05,
+      "loss": 0.6215,
+      "step": 2030
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.3453497082769123,
+      "learning_rate": 9.14158985308033e-05,
+      "loss": 0.6428,
+      "step": 2031
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.3165127728007757,
+      "learning_rate": 9.132984189907975e-05,
+      "loss": 0.5871,
+      "step": 2032
+    },
+    {
+      "epoch": 0.5421333333333334,
+      "grad_norm": 0.37383934724822754,
+      "learning_rate": 9.124379173640064e-05,
+      "loss": 0.5954,
+      "step": 2033
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.343389799835649,
+      "learning_rate": 9.115774810697034e-05,
+      "loss": 0.5961,
+      "step": 2034
+    },
+    {
+      "epoch": 0.5426666666666666,
+      "grad_norm": 0.3576745888356298,
+      "learning_rate": 9.107171107498838e-05,
+      "loss": 0.6831,
+      "step": 2035
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.3600900883948002,
+      "learning_rate": 9.09856807046494e-05,
+      "loss": 0.6257,
+      "step": 2036
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.36104787294082813,
+      "learning_rate": 9.089965706014301e-05,
+      "loss": 0.5985,
+      "step": 2037
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.3541612746079401,
+      "learning_rate": 9.081364020565383e-05,
+      "loss": 0.5988,
+      "step": 2038
+    },
+    {
+      "epoch": 0.5437333333333333,
+      "grad_norm": 0.37535187068876136,
+      "learning_rate": 9.07276302053614e-05,
+      "loss": 0.6278,
+      "step": 2039
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3421339200864777,
+      "learning_rate": 9.064162712344015e-05,
+      "loss": 0.5847,
+      "step": 2040
+    },
+    {
+      "epoch": 0.5442666666666667,
+      "grad_norm": 0.3403665462535796,
+      "learning_rate": 9.05556310240594e-05,
+      "loss": 0.619,
+      "step": 2041
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.3393418599217958,
+      "learning_rate": 9.046964197138316e-05,
+      "loss": 0.6417,
+      "step": 2042
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.33337811114109767,
+      "learning_rate": 9.038366002957028e-05,
+      "loss": 0.566,
+      "step": 2043
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.33068332423472757,
+      "learning_rate": 9.029768526277424e-05,
+      "loss": 0.6178,
+      "step": 2044
+    },
+    {
+      "epoch": 0.5453333333333333,
+      "grad_norm": 0.34597647082344846,
+      "learning_rate": 9.02117177351432e-05,
+      "loss": 0.6075,
+      "step": 2045
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.343550031589001,
+      "learning_rate": 9.012575751081991e-05,
+      "loss": 0.6398,
+      "step": 2046
+    },
+    {
+      "epoch": 0.5458666666666666,
+      "grad_norm": 0.33303193389503644,
+      "learning_rate": 9.003980465394165e-05,
+      "loss": 0.5492,
+      "step": 2047
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.3638930536720473,
+      "learning_rate": 8.995385922864021e-05,
+      "loss": 0.5737,
+      "step": 2048
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.3469533174425106,
+      "learning_rate": 8.986792129904186e-05,
+      "loss": 0.6275,
+      "step": 2049
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.3377209067699381,
+      "learning_rate": 8.978199092926727e-05,
+      "loss": 0.5673,
+      "step": 2050
+    },
+    {
+      "epoch": 0.5469333333333334,
+      "grad_norm": 0.34722544607728356,
+      "learning_rate": 8.969606818343147e-05,
+      "loss": 0.6335,
+      "step": 2051
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.31719714612185756,
+      "learning_rate": 8.961015312564377e-05,
+      "loss": 0.5459,
+      "step": 2052
+    },
+    {
+      "epoch": 0.5474666666666667,
+      "grad_norm": 0.3330824647927296,
+      "learning_rate": 8.952424582000783e-05,
+      "loss": 0.6243,
+      "step": 2053
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.347092057709546,
+      "learning_rate": 8.943834633062136e-05,
+      "loss": 0.6217,
+      "step": 2054
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.3284139348614484,
+      "learning_rate": 8.935245472157639e-05,
+      "loss": 0.602,
+      "step": 2055
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.3402004932097726,
+      "learning_rate": 8.926657105695903e-05,
+      "loss": 0.5998,
+      "step": 2056
+    },
+    {
+      "epoch": 0.5485333333333333,
+      "grad_norm": 0.315831128436594,
+      "learning_rate": 8.918069540084946e-05,
+      "loss": 0.5496,
+      "step": 2057
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.3376938423670618,
+      "learning_rate": 8.909482781732186e-05,
+      "loss": 0.6063,
+      "step": 2058
+    },
+    {
+      "epoch": 0.5490666666666667,
+      "grad_norm": 0.39995440990181685,
+      "learning_rate": 8.900896837044442e-05,
+      "loss": 0.6427,
+      "step": 2059
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.33405138270560814,
+      "learning_rate": 8.892311712427923e-05,
+      "loss": 0.59,
+      "step": 2060
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.37310467131968905,
+      "learning_rate": 8.883727414288235e-05,
+      "loss": 0.5921,
+      "step": 2061
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.34440676481003174,
+      "learning_rate": 8.875143949030346e-05,
+      "loss": 0.6549,
+      "step": 2062
+    },
+    {
+      "epoch": 0.5501333333333334,
+      "grad_norm": 0.34814487185459875,
+      "learning_rate": 8.866561323058627e-05,
+      "loss": 0.5869,
+      "step": 2063
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.35022252482919664,
+      "learning_rate": 8.857979542776808e-05,
+      "loss": 0.5877,
+      "step": 2064
+    },
+    {
+      "epoch": 0.5506666666666666,
+      "grad_norm": 0.37179646007477163,
+      "learning_rate": 8.849398614587993e-05,
+      "loss": 0.6344,
+      "step": 2065
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.33317276543374524,
+      "learning_rate": 8.840818544894648e-05,
+      "loss": 0.5623,
+      "step": 2066
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.37191598469618325,
+      "learning_rate": 8.832239340098605e-05,
+      "loss": 0.6293,
+      "step": 2067
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.3789732328180415,
+      "learning_rate": 8.823661006601042e-05,
+      "loss": 0.6597,
+      "step": 2068
+    },
+    {
+      "epoch": 0.5517333333333333,
+      "grad_norm": 0.36215763819533425,
+      "learning_rate": 8.815083550802495e-05,
+      "loss": 0.5935,
+      "step": 2069
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.32754612704059777,
+      "learning_rate": 8.806506979102834e-05,
+      "loss": 0.6207,
+      "step": 2070
+    },
+    {
+      "epoch": 0.5522666666666667,
+      "grad_norm": 0.3499504458622124,
+      "learning_rate": 8.797931297901276e-05,
+      "loss": 0.6029,
+      "step": 2071
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.32985846583801026,
+      "learning_rate": 8.789356513596379e-05,
+      "loss": 0.5792,
+      "step": 2072
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.35501097287953565,
+      "learning_rate": 8.780782632586023e-05,
+      "loss": 0.6215,
+      "step": 2073
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.3641781892214901,
+      "learning_rate": 8.772209661267418e-05,
+      "loss": 0.5936,
+      "step": 2074
+    },
+    {
+      "epoch": 0.5533333333333333,
+      "grad_norm": 0.3497094810195038,
+      "learning_rate": 8.763637606037097e-05,
+      "loss": 0.5721,
+      "step": 2075
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.35925238926527614,
+      "learning_rate": 8.755066473290904e-05,
+      "loss": 0.6171,
+      "step": 2076
+    },
+    {
+      "epoch": 0.5538666666666666,
+      "grad_norm": 0.37807311052864306,
+      "learning_rate": 8.746496269423999e-05,
+      "loss": 0.6154,
+      "step": 2077
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.327883030793197,
+      "learning_rate": 8.737927000830848e-05,
+      "loss": 0.5958,
+      "step": 2078
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.36752250218691057,
+      "learning_rate": 8.729358673905218e-05,
+      "loss": 0.6579,
+      "step": 2079
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.33357704669776383,
+      "learning_rate": 8.720791295040175e-05,
+      "loss": 0.5738,
+      "step": 2080
+    },
+    {
+      "epoch": 0.5549333333333333,
+      "grad_norm": 0.35818551778045515,
+      "learning_rate": 8.712224870628077e-05,
+      "loss": 0.6508,
+      "step": 2081
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3289981692128152,
+      "learning_rate": 8.703659407060571e-05,
+      "loss": 0.5745,
+      "step": 2082
+    },
+    {
+      "epoch": 0.5554666666666667,
+      "grad_norm": 0.3341807345759168,
+      "learning_rate": 8.695094910728583e-05,
+      "loss": 0.5978,
+      "step": 2083
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.351362014077988,
+      "learning_rate": 8.686531388022325e-05,
+      "loss": 0.6684,
+      "step": 2084
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.3325468561360648,
+      "learning_rate": 8.677968845331274e-05,
+      "loss": 0.5998,
+      "step": 2085
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.35532846758096376,
+      "learning_rate": 8.66940728904418e-05,
+      "loss": 0.582,
+      "step": 2086
+    },
+    {
+      "epoch": 0.5565333333333333,
+      "grad_norm": 0.34063513501893017,
+      "learning_rate": 8.660846725549056e-05,
+      "loss": 0.621,
+      "step": 2087
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.33280622979296764,
+      "learning_rate": 8.652287161233178e-05,
+      "loss": 0.5744,
+      "step": 2088
+    },
+    {
+      "epoch": 0.5570666666666667,
+      "grad_norm": 0.3347037433758992,
+      "learning_rate": 8.64372860248307e-05,
+      "loss": 0.6376,
+      "step": 2089
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.3507905705327085,
+      "learning_rate": 8.635171055684511e-05,
+      "loss": 0.6011,
+      "step": 2090
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.3438632021902035,
+      "learning_rate": 8.626614527222523e-05,
+      "loss": 0.604,
+      "step": 2091
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.3424938685333037,
+      "learning_rate": 8.618059023481368e-05,
+      "loss": 0.6155,
+      "step": 2092
+    },
+    {
+      "epoch": 0.5581333333333334,
+      "grad_norm": 0.32896797668196776,
+      "learning_rate": 8.609504550844542e-05,
+      "loss": 0.5793,
+      "step": 2093
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.33911151122399524,
+      "learning_rate": 8.600951115694775e-05,
+      "loss": 0.601,
+      "step": 2094
+    },
+    {
+      "epoch": 0.5586666666666666,
+      "grad_norm": 0.3334465272246177,
+      "learning_rate": 8.592398724414021e-05,
+      "loss": 0.5915,
+      "step": 2095
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.3460845438683898,
+      "learning_rate": 8.583847383383454e-05,
+      "loss": 0.6313,
+      "step": 2096
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.33897001285361567,
+      "learning_rate": 8.575297098983468e-05,
+      "loss": 0.5973,
+      "step": 2097
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.3377652251865536,
+      "learning_rate": 8.566747877593665e-05,
+      "loss": 0.5772,
+      "step": 2098
+    },
+    {
+      "epoch": 0.5597333333333333,
+      "grad_norm": 0.3509326998534721,
+      "learning_rate": 8.558199725592855e-05,
+      "loss": 0.6192,
+      "step": 2099
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3350008388768927,
+      "learning_rate": 8.549652649359053e-05,
+      "loss": 0.6165,
+      "step": 2100
+    },
+    {
+      "epoch": 0.5602666666666667,
+      "grad_norm": 0.35185785923586005,
+      "learning_rate": 8.541106655269464e-05,
+      "loss": 0.6589,
+      "step": 2101
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.3641117264412489,
+      "learning_rate": 8.532561749700493e-05,
+      "loss": 0.6001,
+      "step": 2102
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.3624688666585318,
+      "learning_rate": 8.524017939027728e-05,
+      "loss": 0.6389,
+      "step": 2103
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.3367140167577722,
+      "learning_rate": 8.515475229625946e-05,
+      "loss": 0.6106,
+      "step": 2104
+    },
+    {
+      "epoch": 0.5613333333333334,
+      "grad_norm": 0.36035269416188537,
+      "learning_rate": 8.506933627869095e-05,
+      "loss": 0.598,
+      "step": 2105
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3704014589770776,
+      "learning_rate": 8.4983931401303e-05,
+      "loss": 0.5981,
+      "step": 2106
+    },
+    {
+      "epoch": 0.5618666666666666,
+      "grad_norm": 0.35576048619854,
+      "learning_rate": 8.489853772781857e-05,
+      "loss": 0.6635,
+      "step": 2107
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.3411775664349861,
+      "learning_rate": 8.481315532195227e-05,
+      "loss": 0.5762,
+      "step": 2108
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.361268949194407,
+      "learning_rate": 8.47277842474102e-05,
+      "loss": 0.6241,
+      "step": 2109
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.3383370950428697,
+      "learning_rate": 8.464242456789009e-05,
+      "loss": 0.6006,
+      "step": 2110
+    },
+    {
+      "epoch": 0.5629333333333333,
+      "grad_norm": 0.33067600513284773,
+      "learning_rate": 8.455707634708115e-05,
+      "loss": 0.5979,
+      "step": 2111
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3372671832014587,
+      "learning_rate": 8.447173964866408e-05,
+      "loss": 0.6467,
+      "step": 2112
+    },
+    {
+      "epoch": 0.5634666666666667,
+      "grad_norm": 0.3517759381980163,
+      "learning_rate": 8.438641453631093e-05,
+      "loss": 0.6049,
+      "step": 2113
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.33766488973058173,
+      "learning_rate": 8.430110107368513e-05,
+      "loss": 0.6178,
+      "step": 2114
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.33149697066517503,
+      "learning_rate": 8.421579932444145e-05,
+      "loss": 0.5679,
+      "step": 2115
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.35216142432794484,
+      "learning_rate": 8.41305093522258e-05,
+      "loss": 0.6352,
+      "step": 2116
+    },
+    {
+      "epoch": 0.5645333333333333,
+      "grad_norm": 0.33587153980527745,
+      "learning_rate": 8.40452312206754e-05,
+      "loss": 0.6041,
+      "step": 2117
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4621106400250287,
+      "learning_rate": 8.395996499341866e-05,
+      "loss": 0.6434,
+      "step": 2118
+    },
+    {
+      "epoch": 0.5650666666666667,
+      "grad_norm": 0.34685449138666086,
+      "learning_rate": 8.387471073407503e-05,
+      "loss": 0.627,
+      "step": 2119
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.3250697411138274,
+      "learning_rate": 8.378946850625509e-05,
+      "loss": 0.5911,
+      "step": 2120
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.33521477785009896,
+      "learning_rate": 8.37042383735604e-05,
+      "loss": 0.616,
+      "step": 2121
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.3556731304369497,
+      "learning_rate": 8.361902039958355e-05,
+      "loss": 0.626,
+      "step": 2122
+    },
+    {
+      "epoch": 0.5661333333333334,
+      "grad_norm": 0.35989902149630715,
+      "learning_rate": 8.353381464790805e-05,
+      "loss": 0.6547,
+      "step": 2123
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3633470323367122,
+      "learning_rate": 8.344862118210817e-05,
+      "loss": 0.6093,
+      "step": 2124
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 0.3643828983267928,
+      "learning_rate": 8.336344006574916e-05,
+      "loss": 0.6388,
+      "step": 2125
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.36020589860079016,
+      "learning_rate": 8.3278271362387e-05,
+      "loss": 0.5954,
+      "step": 2126
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.36464990739190356,
+      "learning_rate": 8.319311513556841e-05,
+      "loss": 0.6031,
+      "step": 2127
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.3459771593329154,
+      "learning_rate": 8.310797144883082e-05,
+      "loss": 0.6408,
+      "step": 2128
+    },
+    {
+      "epoch": 0.5677333333333333,
+      "grad_norm": 0.34126907606757023,
+      "learning_rate": 8.302284036570224e-05,
+      "loss": 0.591,
+      "step": 2129
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.34335754276681585,
+      "learning_rate": 8.293772194970138e-05,
+      "loss": 0.5769,
+      "step": 2130
+    },
+    {
+      "epoch": 0.5682666666666667,
+      "grad_norm": 0.3357509163461733,
+      "learning_rate": 8.285261626433742e-05,
+      "loss": 0.5576,
+      "step": 2131
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.3680848040216543,
+      "learning_rate": 8.276752337311006e-05,
+      "loss": 0.6542,
+      "step": 2132
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.3587450751389266,
+      "learning_rate": 8.268244333950942e-05,
+      "loss": 0.5724,
+      "step": 2133
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.32975765110137656,
+      "learning_rate": 8.259737622701613e-05,
+      "loss": 0.5774,
+      "step": 2134
+    },
+    {
+      "epoch": 0.5693333333333334,
+      "grad_norm": 0.35280175254086077,
+      "learning_rate": 8.251232209910105e-05,
+      "loss": 0.5782,
+      "step": 2135
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.32817665526899037,
+      "learning_rate": 8.242728101922547e-05,
+      "loss": 0.551,
+      "step": 2136
+    },
+    {
+      "epoch": 0.5698666666666666,
+      "grad_norm": 0.34550670703489517,
+      "learning_rate": 8.234225305084084e-05,
+      "loss": 0.5816,
+      "step": 2137
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.33027750539426437,
+      "learning_rate": 8.22572382573889e-05,
+      "loss": 0.6058,
+      "step": 2138
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.34543802263874496,
+      "learning_rate": 8.217223670230157e-05,
+      "loss": 0.6288,
+      "step": 2139
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.3164432690107782,
+      "learning_rate": 8.208724844900078e-05,
+      "loss": 0.5525,
+      "step": 2140
+    },
+    {
+      "epoch": 0.5709333333333333,
+      "grad_norm": 0.3591546450447273,
+      "learning_rate": 8.200227356089864e-05,
+      "loss": 0.6046,
+      "step": 2141
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3461150712033162,
+      "learning_rate": 8.19173121013973e-05,
+      "loss": 0.6221,
+      "step": 2142
+    },
+    {
+      "epoch": 0.5714666666666667,
+      "grad_norm": 0.3540330483209408,
+      "learning_rate": 8.183236413388881e-05,
+      "loss": 0.5995,
+      "step": 2143
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.3434326905690918,
+      "learning_rate": 8.174742972175522e-05,
+      "loss": 0.558,
+      "step": 2144
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.3546826651709987,
+      "learning_rate": 8.166250892836842e-05,
+      "loss": 0.6195,
+      "step": 2145
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.3438081514830633,
+      "learning_rate": 8.157760181709018e-05,
+      "loss": 0.5784,
+      "step": 2146
+    },
+    {
+      "epoch": 0.5725333333333333,
+      "grad_norm": 0.36915836141721475,
+      "learning_rate": 8.149270845127205e-05,
+      "loss": 0.6277,
+      "step": 2147
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.34105981741670754,
+      "learning_rate": 8.140782889425526e-05,
+      "loss": 0.6143,
+      "step": 2148
+    },
+    {
+      "epoch": 0.5730666666666666,
+      "grad_norm": 0.3540664120500646,
+      "learning_rate": 8.132296320937086e-05,
+      "loss": 0.6192,
+      "step": 2149
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.3532922710017726,
+      "learning_rate": 8.123811145993942e-05,
+      "loss": 0.6151,
+      "step": 2150
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.3370465509535198,
+      "learning_rate": 8.115327370927122e-05,
+      "loss": 0.5744,
+      "step": 2151
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.34363340810194093,
+      "learning_rate": 8.106845002066603e-05,
+      "loss": 0.5826,
+      "step": 2152
+    },
+    {
+      "epoch": 0.5741333333333334,
+      "grad_norm": 0.34154527633045134,
+      "learning_rate": 8.098364045741313e-05,
+      "loss": 0.5349,
+      "step": 2153
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3488485033367598,
+      "learning_rate": 8.089884508279135e-05,
+      "loss": 0.5748,
+      "step": 2154
+    },
+    {
+      "epoch": 0.5746666666666667,
+      "grad_norm": 0.3464583346288359,
+      "learning_rate": 8.081406396006877e-05,
+      "loss": 0.5931,
+      "step": 2155
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.3430607709714394,
+      "learning_rate": 8.072929715250293e-05,
+      "loss": 0.6386,
+      "step": 2156
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.4560599387658039,
+      "learning_rate": 8.064454472334076e-05,
+      "loss": 0.5888,
+      "step": 2157
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.3496810619046592,
+      "learning_rate": 8.055980673581832e-05,
+      "loss": 0.6148,
+      "step": 2158
+    },
+    {
+      "epoch": 0.5757333333333333,
+      "grad_norm": 0.3494353089823202,
+      "learning_rate": 8.047508325316102e-05,
+      "loss": 0.5859,
+      "step": 2159
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3651351507114001,
+      "learning_rate": 8.039037433858335e-05,
+      "loss": 0.6171,
+      "step": 2160
+    },
+    {
+      "epoch": 0.5762666666666667,
+      "grad_norm": 0.35926794960992114,
+      "learning_rate": 8.030568005528898e-05,
+      "loss": 0.5852,
+      "step": 2161
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.3622318625788015,
+      "learning_rate": 8.02210004664707e-05,
+      "loss": 0.5518,
+      "step": 2162
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.334224128949714,
+      "learning_rate": 8.01363356353102e-05,
+      "loss": 0.5938,
+      "step": 2163
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.3597879159381735,
+      "learning_rate": 8.00516856249783e-05,
+      "loss": 0.5784,
+      "step": 2164
+    },
+    {
+      "epoch": 0.5773333333333334,
+      "grad_norm": 0.3518818743418102,
+      "learning_rate": 7.996705049863471e-05,
+      "loss": 0.6022,
+      "step": 2165
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4021362296104498,
+      "learning_rate": 7.9882430319428e-05,
+      "loss": 0.5951,
+      "step": 2166
+    },
+    {
+      "epoch": 0.5778666666666666,
+      "grad_norm": 0.3488767362508129,
+      "learning_rate": 7.979782515049567e-05,
+      "loss": 0.5988,
+      "step": 2167
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.35626717974817695,
+      "learning_rate": 7.971323505496398e-05,
+      "loss": 0.622,
+      "step": 2168
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.3382006285897414,
+      "learning_rate": 7.96286600959479e-05,
+      "loss": 0.5864,
+      "step": 2169
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.34574066685301846,
+      "learning_rate": 7.95441003365512e-05,
+      "loss": 0.5302,
+      "step": 2170
+    },
+    {
+      "epoch": 0.5789333333333333,
+      "grad_norm": 0.3403052469166525,
+      "learning_rate": 7.945955583986617e-05,
+      "loss": 0.6384,
+      "step": 2171
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3612333352239432,
+      "learning_rate": 7.937502666897382e-05,
+      "loss": 0.6007,
+      "step": 2172
+    },
+    {
+      "epoch": 0.5794666666666667,
+      "grad_norm": 0.3389929214890273,
+      "learning_rate": 7.929051288694374e-05,
+      "loss": 0.5707,
+      "step": 2173
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.3398935765805274,
+      "learning_rate": 7.920601455683394e-05,
+      "loss": 0.5797,
+      "step": 2174
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.36536918064987484,
+      "learning_rate": 7.912153174169099e-05,
+      "loss": 0.6521,
+      "step": 2175
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.3435878045685125,
+      "learning_rate": 7.903706450454986e-05,
+      "loss": 0.5548,
+      "step": 2176
+    },
+    {
+      "epoch": 0.5805333333333333,
+      "grad_norm": 0.363125914272866,
+      "learning_rate": 7.895261290843386e-05,
+      "loss": 0.6277,
+      "step": 2177
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.3328225446299371,
+      "learning_rate": 7.886817701635472e-05,
+      "loss": 0.5689,
+      "step": 2178
+    },
+    {
+      "epoch": 0.5810666666666666,
+      "grad_norm": 0.3475643192861449,
+      "learning_rate": 7.878375689131232e-05,
+      "loss": 0.619,
+      "step": 2179
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.351111945046791,
+      "learning_rate": 7.869935259629485e-05,
+      "loss": 0.5851,
+      "step": 2180
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.3874561839635163,
+      "learning_rate": 7.861496419427872e-05,
+      "loss": 0.6405,
+      "step": 2181
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.3528461362308666,
+      "learning_rate": 7.853059174822844e-05,
+      "loss": 0.6063,
+      "step": 2182
+    },
+    {
+      "epoch": 0.5821333333333333,
+      "grad_norm": 0.3368603956170215,
+      "learning_rate": 7.844623532109662e-05,
+      "loss": 0.6353,
+      "step": 2183
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.36051823714180337,
+      "learning_rate": 7.836189497582391e-05,
+      "loss": 0.5983,
+      "step": 2184
+    },
+    {
+      "epoch": 0.5826666666666667,
+      "grad_norm": 0.3574779139123841,
+      "learning_rate": 7.827757077533899e-05,
+      "loss": 0.5696,
+      "step": 2185
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.3647603543025058,
+      "learning_rate": 7.819326278255848e-05,
+      "loss": 0.6073,
+      "step": 2186
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.32516747922540085,
+      "learning_rate": 7.810897106038686e-05,
+      "loss": 0.56,
+      "step": 2187
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.343663514568444,
+      "learning_rate": 7.802469567171655e-05,
+      "loss": 0.574,
+      "step": 2188
+    },
+    {
+      "epoch": 0.5837333333333333,
+      "grad_norm": 0.5104045087718205,
+      "learning_rate": 7.794043667942771e-05,
+      "loss": 0.6023,
+      "step": 2189
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3855855411473251,
+      "learning_rate": 7.785619414638835e-05,
+      "loss": 0.6187,
+      "step": 2190
+    },
+    {
+      "epoch": 0.5842666666666667,
+      "grad_norm": 0.3827575440516394,
+      "learning_rate": 7.777196813545413e-05,
+      "loss": 0.6275,
+      "step": 2191
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.3516594315954644,
+      "learning_rate": 7.768775870946837e-05,
+      "loss": 0.5734,
+      "step": 2192
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.3658062669534862,
+      "learning_rate": 7.760356593126211e-05,
+      "loss": 0.6167,
+      "step": 2193
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.35939595039550526,
+      "learning_rate": 7.751938986365385e-05,
+      "loss": 0.6175,
+      "step": 2194
+    },
+    {
+      "epoch": 0.5853333333333334,
+      "grad_norm": 0.3600678656083925,
+      "learning_rate": 7.743523056944972e-05,
+      "loss": 0.6298,
+      "step": 2195
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3414781212657749,
+      "learning_rate": 7.735108811144326e-05,
+      "loss": 0.6289,
+      "step": 2196
+    },
+    {
+      "epoch": 0.5858666666666666,
+      "grad_norm": 0.33506940917704947,
+      "learning_rate": 7.72669625524155e-05,
+      "loss": 0.5857,
+      "step": 2197
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.35039664016573224,
+      "learning_rate": 7.718285395513484e-05,
+      "loss": 0.6238,
+      "step": 2198
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.34707107508845275,
+      "learning_rate": 7.709876238235703e-05,
+      "loss": 0.621,
+      "step": 2199
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.34285251310935566,
+      "learning_rate": 7.70146878968251e-05,
+      "loss": 0.588,
+      "step": 2200
+    },
+    {
+      "epoch": 0.5869333333333333,
+      "grad_norm": 0.3602041822763401,
+      "learning_rate": 7.693063056126942e-05,
+      "loss": 0.5894,
+      "step": 2201
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.38263370361777826,
+      "learning_rate": 7.684659043840737e-05,
+      "loss": 0.6241,
+      "step": 2202
+    },
+    {
+      "epoch": 0.5874666666666667,
+      "grad_norm": 0.33442803356311723,
+      "learning_rate": 7.67625675909437e-05,
+      "loss": 0.6041,
+      "step": 2203
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.34271316136905655,
+      "learning_rate": 7.66785620815701e-05,
+      "loss": 0.5695,
+      "step": 2204
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.34271844060551837,
+      "learning_rate": 7.659457397296548e-05,
+      "loss": 0.5719,
+      "step": 2205
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.34074197334029466,
+      "learning_rate": 7.651060332779563e-05,
+      "loss": 0.5978,
+      "step": 2206
+    },
+    {
+      "epoch": 0.5885333333333334,
+      "grad_norm": 0.35728991493505907,
+      "learning_rate": 7.642665020871338e-05,
+      "loss": 0.6017,
+      "step": 2207
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.34520399217730874,
+      "learning_rate": 7.634271467835851e-05,
+      "loss": 0.5912,
+      "step": 2208
+    },
+    {
+      "epoch": 0.5890666666666666,
+      "grad_norm": 0.348476666315004,
+      "learning_rate": 7.625879679935763e-05,
+      "loss": 0.6316,
+      "step": 2209
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.36361657121293595,
+      "learning_rate": 7.617489663432413e-05,
+      "loss": 0.6447,
+      "step": 2210
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.3597637793311185,
+      "learning_rate": 7.609101424585825e-05,
+      "loss": 0.5939,
+      "step": 2211
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.3539296488483343,
+      "learning_rate": 7.6007149696547e-05,
+      "loss": 0.5899,
+      "step": 2212
+    },
+    {
+      "epoch": 0.5901333333333333,
+      "grad_norm": 0.34309453582511523,
+      "learning_rate": 7.592330304896403e-05,
+      "loss": 0.6185,
+      "step": 2213
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3453739169812385,
+      "learning_rate": 7.58394743656696e-05,
+      "loss": 0.6203,
+      "step": 2214
+    },
+    {
+      "epoch": 0.5906666666666667,
+      "grad_norm": 0.35424522174748785,
+      "learning_rate": 7.575566370921066e-05,
+      "loss": 0.6111,
+      "step": 2215
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.35348887127339396,
+      "learning_rate": 7.567187114212061e-05,
+      "loss": 0.5487,
+      "step": 2216
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.3378762669998194,
+      "learning_rate": 7.558809672691947e-05,
+      "loss": 0.5828,
+      "step": 2217
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.33797924960148396,
+      "learning_rate": 7.550434052611355e-05,
+      "loss": 0.5678,
+      "step": 2218
+    },
+    {
+      "epoch": 0.5917333333333333,
+      "grad_norm": 0.3622053544874109,
+      "learning_rate": 7.54206026021957e-05,
+      "loss": 0.6029,
+      "step": 2219
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3487309719253174,
+      "learning_rate": 7.533688301764511e-05,
+      "loss": 0.618,
+      "step": 2220
+    },
+    {
+      "epoch": 0.5922666666666667,
+      "grad_norm": 0.36616511813036934,
+      "learning_rate": 7.525318183492726e-05,
+      "loss": 0.6237,
+      "step": 2221
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4141903251524446,
+      "learning_rate": 7.516949911649391e-05,
+      "loss": 0.6252,
+      "step": 2222
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.34990643988554515,
+      "learning_rate": 7.50858349247831e-05,
+      "loss": 0.5596,
+      "step": 2223
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.3456055692611483,
+      "learning_rate": 7.500218932221892e-05,
+      "loss": 0.5629,
+      "step": 2224
+    },
+    {
+      "epoch": 0.5933333333333334,
+      "grad_norm": 0.34487246313600933,
+      "learning_rate": 7.491856237121175e-05,
+      "loss": 0.5487,
+      "step": 2225
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.34839659077446355,
+      "learning_rate": 7.483495413415788e-05,
+      "loss": 0.6425,
+      "step": 2226
+    },
+    {
+      "epoch": 0.5938666666666667,
+      "grad_norm": 0.33947982540845745,
+      "learning_rate": 7.475136467343978e-05,
+      "loss": 0.5738,
+      "step": 2227
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.3747753029085932,
+      "learning_rate": 7.46677940514258e-05,
+      "loss": 0.5928,
+      "step": 2228
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.33938024616010914,
+      "learning_rate": 7.458424233047036e-05,
+      "loss": 0.5793,
+      "step": 2229
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.33322286846915244,
+      "learning_rate": 7.450070957291366e-05,
+      "loss": 0.5961,
+      "step": 2230
+    },
+    {
+      "epoch": 0.5949333333333333,
+      "grad_norm": 0.33252742935298535,
+      "learning_rate": 7.441719584108181e-05,
+      "loss": 0.5845,
+      "step": 2231
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3271666683868001,
+      "learning_rate": 7.433370119728673e-05,
+      "loss": 0.5775,
+      "step": 2232
+    },
+    {
+      "epoch": 0.5954666666666667,
+      "grad_norm": 0.36118602544458317,
+      "learning_rate": 7.425022570382605e-05,
+      "loss": 0.6596,
+      "step": 2233
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.3277043497038271,
+      "learning_rate": 7.416676942298314e-05,
+      "loss": 0.5679,
+      "step": 2234
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.3274622490086294,
+      "learning_rate": 7.408333241702705e-05,
+      "loss": 0.5747,
+      "step": 2235
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.3506365359340622,
+      "learning_rate": 7.399991474821243e-05,
+      "loss": 0.6231,
+      "step": 2236
+    },
+    {
+      "epoch": 0.5965333333333334,
+      "grad_norm": 0.32990538416292725,
+      "learning_rate": 7.391651647877953e-05,
+      "loss": 0.5954,
+      "step": 2237
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.33803132565287664,
+      "learning_rate": 7.383313767095407e-05,
+      "loss": 0.5949,
+      "step": 2238
+    },
+    {
+      "epoch": 0.5970666666666666,
+      "grad_norm": 0.357174157769741,
+      "learning_rate": 7.374977838694729e-05,
+      "loss": 0.6293,
+      "step": 2239
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.3316359422424292,
+      "learning_rate": 7.366643868895588e-05,
+      "loss": 0.6013,
+      "step": 2240
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.31772140620864137,
+      "learning_rate": 7.358311863916187e-05,
+      "loss": 0.573,
+      "step": 2241
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.3404401326089692,
+      "learning_rate": 7.349981829973263e-05,
+      "loss": 0.59,
+      "step": 2242
+    },
+    {
+      "epoch": 0.5981333333333333,
+      "grad_norm": 0.38114877266967306,
+      "learning_rate": 7.341653773282085e-05,
+      "loss": 0.5803,
+      "step": 2243
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3354450653736003,
+      "learning_rate": 7.333327700056449e-05,
+      "loss": 0.5496,
+      "step": 2244
+    },
+    {
+      "epoch": 0.5986666666666667,
+      "grad_norm": 0.3446261131581236,
+      "learning_rate": 7.325003616508666e-05,
+      "loss": 0.5994,
+      "step": 2245
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.35349347990876445,
+      "learning_rate": 7.316681528849566e-05,
+      "loss": 0.5803,
+      "step": 2246
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.3824222967935294,
+      "learning_rate": 7.308361443288488e-05,
+      "loss": 0.5958,
+      "step": 2247
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.3437914532851528,
+      "learning_rate": 7.300043366033278e-05,
+      "loss": 0.5816,
+      "step": 2248
+    },
+    {
+      "epoch": 0.5997333333333333,
+      "grad_norm": 0.36128960166606017,
+      "learning_rate": 7.29172730329028e-05,
+      "loss": 0.6084,
+      "step": 2249
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.34836834337392336,
+      "learning_rate": 7.283413261264342e-05,
+      "loss": 0.6035,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6002666666666666,
+      "grad_norm": 0.3329353260047149,
+      "learning_rate": 7.275101246158798e-05,
+      "loss": 0.6099,
+      "step": 2251
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.3844371500453326,
+      "learning_rate": 7.266791264175473e-05,
+      "loss": 0.6055,
+      "step": 2252
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.35335087867196224,
+      "learning_rate": 7.258483321514673e-05,
+      "loss": 0.575,
+      "step": 2253
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.42308983932303557,
+      "learning_rate": 7.250177424375186e-05,
+      "loss": 0.5996,
+      "step": 2254
+    },
+    {
+      "epoch": 0.6013333333333334,
+      "grad_norm": 0.32822661451514334,
+      "learning_rate": 7.241873578954271e-05,
+      "loss": 0.5797,
+      "step": 2255
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.35055550159750304,
+      "learning_rate": 7.233571791447656e-05,
+      "loss": 0.6495,
+      "step": 2256
+    },
+    {
+      "epoch": 0.6018666666666667,
+      "grad_norm": 0.3666828839852818,
+      "learning_rate": 7.225272068049531e-05,
+      "loss": 0.6483,
+      "step": 2257
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.35008777305166217,
+      "learning_rate": 7.216974414952551e-05,
+      "loss": 0.6263,
+      "step": 2258
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.3392558823037353,
+      "learning_rate": 7.208678838347824e-05,
+      "loss": 0.5879,
+      "step": 2259
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.34162997908650017,
+      "learning_rate": 7.200385344424908e-05,
+      "loss": 0.613,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6029333333333333,
+      "grad_norm": 0.35444806707422305,
+      "learning_rate": 7.19209393937181e-05,
+      "loss": 0.6412,
+      "step": 2261
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3529659060309609,
+      "learning_rate": 7.183804629374974e-05,
+      "loss": 0.6089,
+      "step": 2262
+    },
+    {
+      "epoch": 0.6034666666666667,
+      "grad_norm": 0.3399626933463322,
+      "learning_rate": 7.175517420619287e-05,
+      "loss": 0.5967,
+      "step": 2263
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.35725601136877355,
+      "learning_rate": 7.167232319288063e-05,
+      "loss": 0.632,
+      "step": 2264
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.32530161909559985,
+      "learning_rate": 7.15894933156304e-05,
+      "loss": 0.5508,
+      "step": 2265
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.34345519645921707,
+      "learning_rate": 7.150668463624389e-05,
+      "loss": 0.6176,
+      "step": 2266
+    },
+    {
+      "epoch": 0.6045333333333334,
+      "grad_norm": 0.3209910978630537,
+      "learning_rate": 7.142389721650688e-05,
+      "loss": 0.5813,
+      "step": 2267
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3396483762825362,
+      "learning_rate": 7.134113111818943e-05,
+      "loss": 0.572,
+      "step": 2268
+    },
+    {
+      "epoch": 0.6050666666666666,
+      "grad_norm": 0.3586371413833368,
+      "learning_rate": 7.125838640304559e-05,
+      "loss": 0.6457,
+      "step": 2269
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.3735000740871753,
+      "learning_rate": 7.117566313281345e-05,
+      "loss": 0.6003,
+      "step": 2270
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.35323876792442155,
+      "learning_rate": 7.109296136921515e-05,
+      "loss": 0.5995,
+      "step": 2271
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.33637128275139416,
+      "learning_rate": 7.101028117395681e-05,
+      "loss": 0.5906,
+      "step": 2272
+    },
+    {
+      "epoch": 0.6061333333333333,
+      "grad_norm": 0.3701141598599618,
+      "learning_rate": 7.092762260872828e-05,
+      "loss": 0.6305,
+      "step": 2273
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.38931078042726064,
+      "learning_rate": 7.084498573520348e-05,
+      "loss": 0.6676,
+      "step": 2274
+    },
+    {
+      "epoch": 0.6066666666666667,
+      "grad_norm": 0.3586114052701087,
+      "learning_rate": 7.076237061504007e-05,
+      "loss": 0.652,
+      "step": 2275
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.3336436269440386,
+      "learning_rate": 7.067977730987942e-05,
+      "loss": 0.6056,
+      "step": 2276
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.3484378648642681,
+      "learning_rate": 7.059720588134672e-05,
+      "loss": 0.5668,
+      "step": 2277
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.34757392437088325,
+      "learning_rate": 7.05146563910508e-05,
+      "loss": 0.5414,
+      "step": 2278
+    },
+    {
+      "epoch": 0.6077333333333333,
+      "grad_norm": 0.3451030747030425,
+      "learning_rate": 7.043212890058416e-05,
+      "loss": 0.5766,
+      "step": 2279
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.336361264043929,
+      "learning_rate": 7.03496234715227e-05,
+      "loss": 0.5947,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6082666666666666,
+      "grad_norm": 0.36027530875664027,
+      "learning_rate": 7.026714016542611e-05,
+      "loss": 0.5956,
+      "step": 2281
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.3652789805976303,
+      "learning_rate": 7.018467904383741e-05,
+      "loss": 0.6146,
+      "step": 2282
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.3397631443093309,
+      "learning_rate": 7.010224016828316e-05,
+      "loss": 0.5734,
+      "step": 2283
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.3496392467834162,
+      "learning_rate": 7.001982360027324e-05,
+      "loss": 0.6544,
+      "step": 2284
+    },
+    {
+      "epoch": 0.6093333333333333,
+      "grad_norm": 0.34710771111578964,
+      "learning_rate": 6.993742940130097e-05,
+      "loss": 0.5631,
+      "step": 2285
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3282172916460941,
+      "learning_rate": 6.98550576328429e-05,
+      "loss": 0.6248,
+      "step": 2286
+    },
+    {
+      "epoch": 0.6098666666666667,
+      "grad_norm": 0.34740083714067665,
+      "learning_rate": 6.977270835635894e-05,
+      "loss": 0.6039,
+      "step": 2287
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.3450369928481761,
+      "learning_rate": 6.969038163329208e-05,
+      "loss": 0.5721,
+      "step": 2288
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.32838753095150724,
+      "learning_rate": 6.960807752506864e-05,
+      "loss": 0.5916,
+      "step": 2289
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.36201429084135534,
+      "learning_rate": 6.952579609309793e-05,
+      "loss": 0.597,
+      "step": 2290
+    },
+    {
+      "epoch": 0.6109333333333333,
+      "grad_norm": 0.3286486703288118,
+      "learning_rate": 6.94435373987724e-05,
+      "loss": 0.5988,
+      "step": 2291
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3415838874239514,
+      "learning_rate": 6.936130150346758e-05,
+      "loss": 0.615,
+      "step": 2292
+    },
+    {
+      "epoch": 0.6114666666666667,
+      "grad_norm": 0.37671731762232635,
+      "learning_rate": 6.92790884685419e-05,
+      "loss": 0.5637,
+      "step": 2293
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.3337898209689772,
+      "learning_rate": 6.919689835533681e-05,
+      "loss": 0.5977,
+      "step": 2294
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.3596518136966967,
+      "learning_rate": 6.91147312251766e-05,
+      "loss": 0.6102,
+      "step": 2295
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.34362279762458187,
+      "learning_rate": 6.903258713936843e-05,
+      "loss": 0.6262,
+      "step": 2296
+    },
+    {
+      "epoch": 0.6125333333333334,
+      "grad_norm": 0.3464009451685017,
+      "learning_rate": 6.895046615920229e-05,
+      "loss": 0.5993,
+      "step": 2297
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.35826322775097214,
+      "learning_rate": 6.88683683459509e-05,
+      "loss": 0.5708,
+      "step": 2298
+    },
+    {
+      "epoch": 0.6130666666666666,
+      "grad_norm": 0.3713496698997063,
+      "learning_rate": 6.878629376086969e-05,
+      "loss": 0.6001,
+      "step": 2299
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.3451019877473833,
+      "learning_rate": 6.870424246519682e-05,
+      "loss": 0.5785,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.3220426975261808,
+      "learning_rate": 6.862221452015298e-05,
+      "loss": 0.5986,
+      "step": 2301
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.3763352966281929,
+      "learning_rate": 6.854020998694152e-05,
+      "loss": 0.6406,
+      "step": 2302
+    },
+    {
+      "epoch": 0.6141333333333333,
+      "grad_norm": 0.34134138605415293,
+      "learning_rate": 6.845822892674829e-05,
+      "loss": 0.5792,
+      "step": 2303
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.38388924190673807,
+      "learning_rate": 6.837627140074159e-05,
+      "loss": 0.6398,
+      "step": 2304
+    },
+    {
+      "epoch": 0.6146666666666667,
+      "grad_norm": 0.34419290228058447,
+      "learning_rate": 6.829433747007221e-05,
+      "loss": 0.6127,
+      "step": 2305
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.3619163626067995,
+      "learning_rate": 6.821242719587331e-05,
+      "loss": 0.6138,
+      "step": 2306
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.33745554573796577,
+      "learning_rate": 6.813054063926044e-05,
+      "loss": 0.6227,
+      "step": 2307
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.33848523185725055,
+      "learning_rate": 6.804867786133137e-05,
+      "loss": 0.5876,
+      "step": 2308
+    },
+    {
+      "epoch": 0.6157333333333334,
+      "grad_norm": 0.34346243670470383,
+      "learning_rate": 6.796683892316623e-05,
+      "loss": 0.637,
+      "step": 2309
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.36269220774445005,
+      "learning_rate": 6.788502388582727e-05,
+      "loss": 0.6377,
+      "step": 2310
+    },
+    {
+      "epoch": 0.6162666666666666,
+      "grad_norm": 0.33404604813329786,
+      "learning_rate": 6.780323281035903e-05,
+      "loss": 0.59,
+      "step": 2311
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.349935025702179,
+      "learning_rate": 6.772146575778795e-05,
+      "loss": 0.6061,
+      "step": 2312
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.34072266601086976,
+      "learning_rate": 6.76397227891228e-05,
+      "loss": 0.6095,
+      "step": 2313
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.3419996604542034,
+      "learning_rate": 6.755800396535423e-05,
+      "loss": 0.5907,
+      "step": 2314
+    },
+    {
+      "epoch": 0.6173333333333333,
+      "grad_norm": 0.3357700024502719,
+      "learning_rate": 6.747630934745491e-05,
+      "loss": 0.6371,
+      "step": 2315
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3528200296628358,
+      "learning_rate": 6.739463899637945e-05,
+      "loss": 0.597,
+      "step": 2316
+    },
+    {
+      "epoch": 0.6178666666666667,
+      "grad_norm": 0.3504407344694739,
+      "learning_rate": 6.731299297306436e-05,
+      "loss": 0.6013,
+      "step": 2317
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.3408528689710479,
+      "learning_rate": 6.723137133842805e-05,
+      "loss": 0.6198,
+      "step": 2318
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.316655494416179,
+      "learning_rate": 6.714977415337058e-05,
+      "loss": 0.5578,
+      "step": 2319
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.34469447758237826,
+      "learning_rate": 6.706820147877388e-05,
+      "loss": 0.6225,
+      "step": 2320
+    },
+    {
+      "epoch": 0.6189333333333333,
+      "grad_norm": 0.31891601740953823,
+      "learning_rate": 6.698665337550161e-05,
+      "loss": 0.5802,
+      "step": 2321
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.34104339043253923,
+      "learning_rate": 6.690512990439902e-05,
+      "loss": 0.5901,
+      "step": 2322
+    },
+    {
+      "epoch": 0.6194666666666667,
+      "grad_norm": 0.34402374532188523,
+      "learning_rate": 6.682363112629308e-05,
+      "loss": 0.5689,
+      "step": 2323
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.347212354553933,
+      "learning_rate": 6.674215710199226e-05,
+      "loss": 0.6309,
+      "step": 2324
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.3370658855896269,
+      "learning_rate": 6.666070789228655e-05,
+      "loss": 0.5521,
+      "step": 2325
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.3397399840731556,
+      "learning_rate": 6.657928355794752e-05,
+      "loss": 0.5665,
+      "step": 2326
+    },
+    {
+      "epoch": 0.6205333333333334,
+      "grad_norm": 0.35362450046341737,
+      "learning_rate": 6.649788415972804e-05,
+      "loss": 0.6259,
+      "step": 2327
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.37135044171802783,
+      "learning_rate": 6.641650975836248e-05,
+      "loss": 0.5936,
+      "step": 2328
+    },
+    {
+      "epoch": 0.6210666666666667,
+      "grad_norm": 0.3574490093025898,
+      "learning_rate": 6.633516041456654e-05,
+      "loss": 0.5939,
+      "step": 2329
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.34538946609485643,
+      "learning_rate": 6.625383618903718e-05,
+      "loss": 0.6016,
+      "step": 2330
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.3388382630145427,
+      "learning_rate": 6.617253714245268e-05,
+      "loss": 0.5736,
+      "step": 2331
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.3549451875907997,
+      "learning_rate": 6.609126333547249e-05,
+      "loss": 0.587,
+      "step": 2332
+    },
+    {
+      "epoch": 0.6221333333333333,
+      "grad_norm": 0.3698487517284332,
+      "learning_rate": 6.601001482873724e-05,
+      "loss": 0.6586,
+      "step": 2333
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.35090441455602456,
+      "learning_rate": 6.592879168286874e-05,
+      "loss": 0.6151,
+      "step": 2334
+    },
+    {
+      "epoch": 0.6226666666666667,
+      "grad_norm": 0.33661192496141673,
+      "learning_rate": 6.584759395846974e-05,
+      "loss": 0.5856,
+      "step": 2335
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.34836025792196795,
+      "learning_rate": 6.576642171612413e-05,
+      "loss": 0.5628,
+      "step": 2336
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.34069574890406146,
+      "learning_rate": 6.568527501639679e-05,
+      "loss": 0.6179,
+      "step": 2337
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.33665880063585896,
+      "learning_rate": 6.560415391983348e-05,
+      "loss": 0.5873,
+      "step": 2338
+    },
+    {
+      "epoch": 0.6237333333333334,
+      "grad_norm": 0.3583256944706876,
+      "learning_rate": 6.552305848696092e-05,
+      "loss": 0.6082,
+      "step": 2339
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33452834970612355,
+      "learning_rate": 6.544198877828662e-05,
+      "loss": 0.6091,
+      "step": 2340
+    },
+    {
+      "epoch": 0.6242666666666666,
+      "grad_norm": 0.3446645871496446,
+      "learning_rate": 6.536094485429897e-05,
+      "loss": 0.5962,
+      "step": 2341
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.3413265050364765,
+      "learning_rate": 6.527992677546706e-05,
+      "loss": 0.6085,
+      "step": 2342
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.31934875720828215,
+      "learning_rate": 6.51989346022407e-05,
+      "loss": 0.5955,
+      "step": 2343
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.36026896123073415,
+      "learning_rate": 6.51179683950504e-05,
+      "loss": 0.6107,
+      "step": 2344
+    },
+    {
+      "epoch": 0.6253333333333333,
+      "grad_norm": 0.35697524084038335,
+      "learning_rate": 6.503702821430728e-05,
+      "loss": 0.5941,
+      "step": 2345
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.35827259454318255,
+      "learning_rate": 6.495611412040306e-05,
+      "loss": 0.6597,
+      "step": 2346
+    },
+    {
+      "epoch": 0.6258666666666667,
+      "grad_norm": 0.3754157539262379,
+      "learning_rate": 6.487522617370996e-05,
+      "loss": 0.6106,
+      "step": 2347
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.33984543828264,
+      "learning_rate": 6.479436443458072e-05,
+      "loss": 0.5775,
+      "step": 2348
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.3281350978505545,
+      "learning_rate": 6.471352896334851e-05,
+      "loss": 0.571,
+      "step": 2349
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.32423939957163334,
+      "learning_rate": 6.463271982032695e-05,
+      "loss": 0.6119,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6269333333333333,
+      "grad_norm": 0.36810359773927115,
+      "learning_rate": 6.45519370658099e-05,
+      "loss": 0.6534,
+      "step": 2351
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3195658184081652,
+      "learning_rate": 6.447118076007165e-05,
+      "loss": 0.5435,
+      "step": 2352
+    },
+    {
+      "epoch": 0.6274666666666666,
+      "grad_norm": 0.33663837766452803,
+      "learning_rate": 6.43904509633667e-05,
+      "loss": 0.5974,
+      "step": 2353
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.34826678764415414,
+      "learning_rate": 6.43097477359298e-05,
+      "loss": 0.5899,
+      "step": 2354
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.32103005144329616,
+      "learning_rate": 6.422907113797581e-05,
+      "loss": 0.6155,
+      "step": 2355
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.35378196416816504,
+      "learning_rate": 6.414842122969981e-05,
+      "loss": 0.6276,
+      "step": 2356
+    },
+    {
+      "epoch": 0.6285333333333334,
+      "grad_norm": 0.3810926164409607,
+      "learning_rate": 6.406779807127695e-05,
+      "loss": 0.5984,
+      "step": 2357
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.35133122231985336,
+      "learning_rate": 6.398720172286231e-05,
+      "loss": 0.6007,
+      "step": 2358
+    },
+    {
+      "epoch": 0.6290666666666667,
+      "grad_norm": 0.33961515311091645,
+      "learning_rate": 6.390663224459111e-05,
+      "loss": 0.608,
+      "step": 2359
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.34796322280463593,
+      "learning_rate": 6.382608969657846e-05,
+      "loss": 0.6124,
+      "step": 2360
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3526323514364444,
+      "learning_rate": 6.374557413891936e-05,
+      "loss": 0.6259,
+      "step": 2361
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.33417535778932184,
+      "learning_rate": 6.36650856316887e-05,
+      "loss": 0.5916,
+      "step": 2362
+    },
+    {
+      "epoch": 0.6301333333333333,
+      "grad_norm": 0.32604666321789155,
+      "learning_rate": 6.35846242349412e-05,
+      "loss": 0.5737,
+      "step": 2363
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.32609304597025857,
+      "learning_rate": 6.350419000871129e-05,
+      "loss": 0.599,
+      "step": 2364
+    },
+    {
+      "epoch": 0.6306666666666667,
+      "grad_norm": 0.3449149497273829,
+      "learning_rate": 6.342378301301324e-05,
+      "loss": 0.5899,
+      "step": 2365
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.3589645806850102,
+      "learning_rate": 6.334340330784083e-05,
+      "loss": 0.6403,
+      "step": 2366
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.33156136343929354,
+      "learning_rate": 6.326305095316762e-05,
+      "loss": 0.5803,
+      "step": 2367
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.3275785959556299,
+      "learning_rate": 6.318272600894675e-05,
+      "loss": 0.5368,
+      "step": 2368
+    },
+    {
+      "epoch": 0.6317333333333334,
+      "grad_norm": 0.3601974862755085,
+      "learning_rate": 6.310242853511083e-05,
+      "loss": 0.633,
+      "step": 2369
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3422910202857035,
+      "learning_rate": 6.302215859157208e-05,
+      "loss": 0.562,
+      "step": 2370
+    },
+    {
+      "epoch": 0.6322666666666666,
+      "grad_norm": 0.38307555025222,
+      "learning_rate": 6.294191623822207e-05,
+      "loss": 0.6391,
+      "step": 2371
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.36061672282013274,
+      "learning_rate": 6.286170153493188e-05,
+      "loss": 0.687,
+      "step": 2372
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.35955909782102774,
+      "learning_rate": 6.278151454155192e-05,
+      "loss": 0.5953,
+      "step": 2373
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.3530964829173126,
+      "learning_rate": 6.270135531791187e-05,
+      "loss": 0.5876,
+      "step": 2374
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 0.3391126742582884,
+      "learning_rate": 6.262122392382075e-05,
+      "loss": 0.5696,
+      "step": 2375
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.36073389005568296,
+      "learning_rate": 6.254112041906683e-05,
+      "loss": 0.5911,
+      "step": 2376
+    },
+    {
+      "epoch": 0.6338666666666667,
+      "grad_norm": 0.3980083373201319,
+      "learning_rate": 6.246104486341753e-05,
+      "loss": 0.6094,
+      "step": 2377
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.32413698778242334,
+      "learning_rate": 6.238099731661942e-05,
+      "loss": 0.5743,
+      "step": 2378
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.3382939342418333,
+      "learning_rate": 6.230097783839825e-05,
+      "loss": 0.5994,
+      "step": 2379
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.3393203304566258,
+      "learning_rate": 6.22209864884587e-05,
+      "loss": 0.5709,
+      "step": 2380
+    },
+    {
+      "epoch": 0.6349333333333333,
+      "grad_norm": 0.33839393026082126,
+      "learning_rate": 6.21410233264846e-05,
+      "loss": 0.6083,
+      "step": 2381
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3711960839308339,
+      "learning_rate": 6.206108841213856e-05,
+      "loss": 0.6469,
+      "step": 2382
+    },
+    {
+      "epoch": 0.6354666666666666,
+      "grad_norm": 0.341931410734462,
+      "learning_rate": 6.19811818050623e-05,
+      "loss": 0.5604,
+      "step": 2383
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.3247703605715169,
+      "learning_rate": 6.190130356487634e-05,
+      "loss": 0.5812,
+      "step": 2384
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.35434578547863294,
+      "learning_rate": 6.182145375118002e-05,
+      "loss": 0.5554,
+      "step": 2385
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.3556056044213876,
+      "learning_rate": 6.17416324235515e-05,
+      "loss": 0.6131,
+      "step": 2386
+    },
+    {
+      "epoch": 0.6365333333333333,
+      "grad_norm": 0.3724091841904112,
+      "learning_rate": 6.16618396415477e-05,
+      "loss": 0.6079,
+      "step": 2387
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.34070475795863037,
+      "learning_rate": 6.158207546470421e-05,
+      "loss": 0.6456,
+      "step": 2388
+    },
+    {
+      "epoch": 0.6370666666666667,
+      "grad_norm": 0.33656276566350446,
+      "learning_rate": 6.150233995253527e-05,
+      "loss": 0.608,
+      "step": 2389
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.3605240523953494,
+      "learning_rate": 6.142263316453377e-05,
+      "loss": 0.5943,
+      "step": 2390
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.3453808744153834,
+      "learning_rate": 6.134295516017109e-05,
+      "loss": 0.6191,
+      "step": 2391
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.3117371998839096,
+      "learning_rate": 6.126330599889724e-05,
+      "loss": 0.5439,
+      "step": 2392
+    },
+    {
+      "epoch": 0.6381333333333333,
+      "grad_norm": 0.31898691886780756,
+      "learning_rate": 6.118368574014066e-05,
+      "loss": 0.5318,
+      "step": 2393
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.36728965219198434,
+      "learning_rate": 6.110409444330823e-05,
+      "loss": 0.608,
+      "step": 2394
+    },
+    {
+      "epoch": 0.6386666666666667,
+      "grad_norm": 0.35233809111531966,
+      "learning_rate": 6.1024532167785164e-05,
+      "loss": 0.5759,
+      "step": 2395
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3344103431990208,
+      "learning_rate": 6.094499897293515e-05,
+      "loss": 0.5932,
+      "step": 2396
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.3337176651337381,
+      "learning_rate": 6.086549491810003e-05,
+      "loss": 0.5967,
+      "step": 2397
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.33263926634069596,
+      "learning_rate": 6.0786020062600016e-05,
+      "loss": 0.605,
+      "step": 2398
+    },
+    {
+      "epoch": 0.6397333333333334,
+      "grad_norm": 0.3923339162462857,
+      "learning_rate": 6.070657446573347e-05,
+      "loss": 0.6399,
+      "step": 2399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.33958658057735047,
+      "learning_rate": 6.062715818677696e-05,
+      "loss": 0.5974,
+      "step": 2400
+    },
+    {
+      "epoch": 0.6402666666666667,
+      "grad_norm": 0.3300127156212101,
+      "learning_rate": 6.054777128498515e-05,
+      "loss": 0.5568,
+      "step": 2401
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.3581778039589509,
+      "learning_rate": 6.046841381959082e-05,
+      "loss": 0.5886,
+      "step": 2402
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.32294944624189975,
+      "learning_rate": 6.038908584980476e-05,
+      "loss": 0.5943,
+      "step": 2403
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.348186094768716,
+      "learning_rate": 6.030978743481578e-05,
+      "loss": 0.6176,
+      "step": 2404
+    },
+    {
+      "epoch": 0.6413333333333333,
+      "grad_norm": 0.37498759929633807,
+      "learning_rate": 6.023051863379057e-05,
+      "loss": 0.6029,
+      "step": 2405
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.32591324421105833,
+      "learning_rate": 6.01512795058738e-05,
+      "loss": 0.5429,
+      "step": 2406
+    },
+    {
+      "epoch": 0.6418666666666667,
+      "grad_norm": 0.360212112950924,
+      "learning_rate": 6.007207011018796e-05,
+      "loss": 0.578,
+      "step": 2407
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.35859697448639555,
+      "learning_rate": 5.999289050583339e-05,
+      "loss": 0.5891,
+      "step": 2408
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.3415563326990654,
+      "learning_rate": 5.991374075188816e-05,
+      "loss": 0.5936,
+      "step": 2409
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.36459265659956647,
+      "learning_rate": 5.98346209074081e-05,
+      "loss": 0.6104,
+      "step": 2410
+    },
+    {
+      "epoch": 0.6429333333333334,
+      "grad_norm": 0.34660026757714923,
+      "learning_rate": 5.975553103142669e-05,
+      "loss": 0.6186,
+      "step": 2411
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3522029968780649,
+      "learning_rate": 5.9676471182955116e-05,
+      "loss": 0.6096,
+      "step": 2412
+    },
+    {
+      "epoch": 0.6434666666666666,
+      "grad_norm": 0.34374410390207266,
+      "learning_rate": 5.959744142098207e-05,
+      "loss": 0.6051,
+      "step": 2413
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.34667109670639695,
+      "learning_rate": 5.9518441804473846e-05,
+      "loss": 0.6021,
+      "step": 2414
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.358636275303948,
+      "learning_rate": 5.943947239237424e-05,
+      "loss": 0.5781,
+      "step": 2415
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.32347244785117024,
+      "learning_rate": 5.936053324360453e-05,
+      "loss": 0.5727,
+      "step": 2416
+    },
+    {
+      "epoch": 0.6445333333333333,
+      "grad_norm": 0.33767847752157737,
+      "learning_rate": 5.9281624417063395e-05,
+      "loss": 0.6232,
+      "step": 2417
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.32149237569745887,
+      "learning_rate": 5.9202745971626864e-05,
+      "loss": 0.5228,
+      "step": 2418
+    },
+    {
+      "epoch": 0.6450666666666667,
+      "grad_norm": 0.35127631413085253,
+      "learning_rate": 5.912389796614835e-05,
+      "loss": 0.6093,
+      "step": 2419
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.35284551550376864,
+      "learning_rate": 5.9045080459458535e-05,
+      "loss": 0.6011,
+      "step": 2420
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.342214804035069,
+      "learning_rate": 5.89662935103653e-05,
+      "loss": 0.5861,
+      "step": 2421
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.3308244044650585,
+      "learning_rate": 5.8887537177653786e-05,
+      "loss": 0.5929,
+      "step": 2422
+    },
+    {
+      "epoch": 0.6461333333333333,
+      "grad_norm": 0.3241516134333082,
+      "learning_rate": 5.880881152008623e-05,
+      "loss": 0.5774,
+      "step": 2423
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3346715824571503,
+      "learning_rate": 5.8730116596402084e-05,
+      "loss": 0.6211,
+      "step": 2424
+    },
+    {
+      "epoch": 0.6466666666666666,
+      "grad_norm": 0.3364017085713504,
+      "learning_rate": 5.865145246531776e-05,
+      "loss": 0.5951,
+      "step": 2425
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.350056816015172,
+      "learning_rate": 5.857281918552677e-05,
+      "loss": 0.5903,
+      "step": 2426
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.32984344774870605,
+      "learning_rate": 5.8494216815699556e-05,
+      "loss": 0.5915,
+      "step": 2427
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.7645698228696151,
+      "learning_rate": 5.841564541448356e-05,
+      "loss": 0.5994,
+      "step": 2428
+    },
+    {
+      "epoch": 0.6477333333333334,
+      "grad_norm": 0.36233641363172175,
+      "learning_rate": 5.833710504050298e-05,
+      "loss": 0.5436,
+      "step": 2429
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.38125683813856825,
+      "learning_rate": 5.8258595752359036e-05,
+      "loss": 0.6347,
+      "step": 2430
+    },
+    {
+      "epoch": 0.6482666666666667,
+      "grad_norm": 0.3514245491403789,
+      "learning_rate": 5.8180117608629645e-05,
+      "loss": 0.577,
+      "step": 2431
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.342700705782899,
+      "learning_rate": 5.810167066786951e-05,
+      "loss": 0.6108,
+      "step": 2432
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.3386868299620465,
+      "learning_rate": 5.80232549886101e-05,
+      "loss": 0.5576,
+      "step": 2433
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.35904758414658006,
+      "learning_rate": 5.794487062935948e-05,
+      "loss": 0.6069,
+      "step": 2434
+    },
+    {
+      "epoch": 0.6493333333333333,
+      "grad_norm": 0.32993308622412953,
+      "learning_rate": 5.78665176486024e-05,
+      "loss": 0.5499,
+      "step": 2435
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.34450413336212,
+      "learning_rate": 5.7788196104800194e-05,
+      "loss": 0.6022,
+      "step": 2436
+    },
+    {
+      "epoch": 0.6498666666666667,
+      "grad_norm": 0.3397398179716159,
+      "learning_rate": 5.770990605639071e-05,
+      "loss": 0.5983,
+      "step": 2437
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.33207919165370103,
+      "learning_rate": 5.763164756178833e-05,
+      "loss": 0.5554,
+      "step": 2438
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.36498725108404595,
+      "learning_rate": 5.755342067938386e-05,
+      "loss": 0.6021,
+      "step": 2439
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.33331872270019547,
+      "learning_rate": 5.747522546754456e-05,
+      "loss": 0.6,
+      "step": 2440
+    },
+    {
+      "epoch": 0.6509333333333334,
+      "grad_norm": 0.3388025965934976,
+      "learning_rate": 5.739706198461402e-05,
+      "loss": 0.6461,
+      "step": 2441
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3345424369891375,
+      "learning_rate": 5.731893028891218e-05,
+      "loss": 0.6068,
+      "step": 2442
+    },
+    {
+      "epoch": 0.6514666666666666,
+      "grad_norm": 0.34908972711079245,
+      "learning_rate": 5.7240830438735295e-05,
+      "loss": 0.5962,
+      "step": 2443
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.34496049207322027,
+      "learning_rate": 5.7162762492355746e-05,
+      "loss": 0.5779,
+      "step": 2444
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.3438736478434742,
+      "learning_rate": 5.708472650802221e-05,
+      "loss": 0.5931,
+      "step": 2445
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4031791703967563,
+      "learning_rate": 5.7006722543959515e-05,
+      "loss": 0.6027,
+      "step": 2446
+    },
+    {
+      "epoch": 0.6525333333333333,
+      "grad_norm": 0.34634120686413566,
+      "learning_rate": 5.6928750658368555e-05,
+      "loss": 0.5972,
+      "step": 2447
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.33949998651994945,
+      "learning_rate": 5.68508109094263e-05,
+      "loss": 0.6313,
+      "step": 2448
+    },
+    {
+      "epoch": 0.6530666666666667,
+      "grad_norm": 0.34180683935263556,
+      "learning_rate": 5.6772903355285755e-05,
+      "loss": 0.5996,
+      "step": 2449
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.3621528767223928,
+      "learning_rate": 5.669502805407591e-05,
+      "loss": 0.574,
+      "step": 2450
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.34832301863962545,
+      "learning_rate": 5.6617185063901714e-05,
+      "loss": 0.5673,
+      "step": 2451
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.346287044119997,
+      "learning_rate": 5.653937444284389e-05,
+      "loss": 0.6045,
+      "step": 2452
+    },
+    {
+      "epoch": 0.6541333333333333,
+      "grad_norm": 0.3566316323533672,
+      "learning_rate": 5.6461596248959115e-05,
+      "loss": 0.602,
+      "step": 2453
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.3287837448539727,
+      "learning_rate": 5.638385054027987e-05,
+      "loss": 0.5895,
+      "step": 2454
+    },
+    {
+      "epoch": 0.6546666666666666,
+      "grad_norm": 0.3613129767717555,
+      "learning_rate": 5.6306137374814363e-05,
+      "loss": 0.6258,
+      "step": 2455
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.3566901495801631,
+      "learning_rate": 5.622845681054651e-05,
+      "loss": 0.63,
+      "step": 2456
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.35772424480405757,
+      "learning_rate": 5.6150808905435984e-05,
+      "loss": 0.6003,
+      "step": 2457
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.34052185284268577,
+      "learning_rate": 5.607319371741799e-05,
+      "loss": 0.6489,
+      "step": 2458
+    },
+    {
+      "epoch": 0.6557333333333333,
+      "grad_norm": 0.347116530437614,
+      "learning_rate": 5.599561130440343e-05,
+      "loss": 0.5959,
+      "step": 2459
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.33518404133193597,
+      "learning_rate": 5.5918061724278584e-05,
+      "loss": 0.5906,
+      "step": 2460
+    },
+    {
+      "epoch": 0.6562666666666667,
+      "grad_norm": 0.3476084147046415,
+      "learning_rate": 5.5840545034905365e-05,
+      "loss": 0.6216,
+      "step": 2461
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.3338795967471878,
+      "learning_rate": 5.5763061294121154e-05,
+      "loss": 0.5856,
+      "step": 2462
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.36357983418745365,
+      "learning_rate": 5.568561055973868e-05,
+      "loss": 0.5714,
+      "step": 2463
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.3508803528152843,
+      "learning_rate": 5.5608192889546085e-05,
+      "loss": 0.5932,
+      "step": 2464
+    },
+    {
+      "epoch": 0.6573333333333333,
+      "grad_norm": 0.46003479213026277,
+      "learning_rate": 5.553080834130682e-05,
+      "loss": 0.6168,
+      "step": 2465
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.3397964668589436,
+      "learning_rate": 5.545345697275964e-05,
+      "loss": 0.5933,
+      "step": 2466
+    },
+    {
+      "epoch": 0.6578666666666667,
+      "grad_norm": 0.3671332812649807,
+      "learning_rate": 5.537613884161859e-05,
+      "loss": 0.5557,
+      "step": 2467
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.3381807353666364,
+      "learning_rate": 5.529885400557277e-05,
+      "loss": 0.5397,
+      "step": 2468
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.353728469181082,
+      "learning_rate": 5.5221602522286565e-05,
+      "loss": 0.5924,
+      "step": 2469
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.36028533391105544,
+      "learning_rate": 5.5144384449399466e-05,
+      "loss": 0.5982,
+      "step": 2470
+    },
+    {
+      "epoch": 0.6589333333333334,
+      "grad_norm": 0.3682945885183188,
+      "learning_rate": 5.506719984452597e-05,
+      "loss": 0.6169,
+      "step": 2471
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.35033546794801473,
+      "learning_rate": 5.499004876525569e-05,
+      "loss": 0.5766,
+      "step": 2472
+    },
+    {
+      "epoch": 0.6594666666666666,
+      "grad_norm": 0.3589300885290531,
+      "learning_rate": 5.4912931269153134e-05,
+      "loss": 0.6766,
+      "step": 2473
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.34090204625039444,
+      "learning_rate": 5.483584741375781e-05,
+      "loss": 0.5639,
+      "step": 2474
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.3686975922347045,
+      "learning_rate": 5.475879725658413e-05,
+      "loss": 0.6005,
+      "step": 2475
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.3944519721443153,
+      "learning_rate": 5.468178085512132e-05,
+      "loss": 0.6252,
+      "step": 2476
+    },
+    {
+      "epoch": 0.6605333333333333,
+      "grad_norm": 0.3999240027948932,
+      "learning_rate": 5.4604798266833455e-05,
+      "loss": 0.6019,
+      "step": 2477
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3295366763050033,
+      "learning_rate": 5.452784954915937e-05,
+      "loss": 0.5929,
+      "step": 2478
+    },
+    {
+      "epoch": 0.6610666666666667,
+      "grad_norm": 0.37837611354732187,
+      "learning_rate": 5.445093475951263e-05,
+      "loss": 0.6029,
+      "step": 2479
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.4533398937731497,
+      "learning_rate": 5.437405395528148e-05,
+      "loss": 0.5427,
+      "step": 2480
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.3436047315121984,
+      "learning_rate": 5.4297207193828804e-05,
+      "loss": 0.555,
+      "step": 2481
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.34018489822301473,
+      "learning_rate": 5.422039453249216e-05,
+      "loss": 0.5574,
+      "step": 2482
+    },
+    {
+      "epoch": 0.6621333333333334,
+      "grad_norm": 0.33501399145007826,
+      "learning_rate": 5.414361602858349e-05,
+      "loss": 0.5746,
+      "step": 2483
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.369499967322875,
+      "learning_rate": 5.40668717393894e-05,
+      "loss": 0.6149,
+      "step": 2484
+    },
+    {
+      "epoch": 0.6626666666666666,
+      "grad_norm": 0.35805023715592077,
+      "learning_rate": 5.399016172217093e-05,
+      "loss": 0.6133,
+      "step": 2485
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.32037798114227906,
+      "learning_rate": 5.391348603416353e-05,
+      "loss": 0.5607,
+      "step": 2486
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.3544221484186545,
+      "learning_rate": 5.383684473257707e-05,
+      "loss": 0.6019,
+      "step": 2487
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.3544589771151001,
+      "learning_rate": 5.376023787459574e-05,
+      "loss": 0.6019,
+      "step": 2488
+    },
+    {
+      "epoch": 0.6637333333333333,
+      "grad_norm": 0.3374719310464669,
+      "learning_rate": 5.3683665517378004e-05,
+      "loss": 0.6061,
+      "step": 2489
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3468952660891746,
+      "learning_rate": 5.3607127718056695e-05,
+      "loss": 0.5899,
+      "step": 2490
+    },
+    {
+      "epoch": 0.6642666666666667,
+      "grad_norm": 0.3407278819614768,
+      "learning_rate": 5.353062453373862e-05,
+      "loss": 0.5522,
+      "step": 2491
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.33739507650594036,
+      "learning_rate": 5.3454156021505055e-05,
+      "loss": 0.5983,
+      "step": 2492
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.33262547013987365,
+      "learning_rate": 5.337772223841122e-05,
+      "loss": 0.5613,
+      "step": 2493
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.3648979827617922,
+      "learning_rate": 5.330132324148649e-05,
+      "loss": 0.5623,
+      "step": 2494
+    },
+    {
+      "epoch": 0.6653333333333333,
+      "grad_norm": 0.32865851853796996,
+      "learning_rate": 5.3224959087734264e-05,
+      "loss": 0.6187,
+      "step": 2495
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3983097493320104,
+      "learning_rate": 5.3148629834131925e-05,
+      "loss": 0.5927,
+      "step": 2496
+    },
+    {
+      "epoch": 0.6658666666666667,
+      "grad_norm": 0.3664175826588646,
+      "learning_rate": 5.3072335537630845e-05,
+      "loss": 0.5455,
+      "step": 2497
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.31722078152711936,
+      "learning_rate": 5.299607625515637e-05,
+      "loss": 0.5379,
+      "step": 2498
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.3360887776404759,
+      "learning_rate": 5.291985204360754e-05,
+      "loss": 0.5521,
+      "step": 2499
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.34364499649892183,
+      "learning_rate": 5.284366295985741e-05,
+      "loss": 0.6079,
+      "step": 2500
+    },
+    {
+      "epoch": 0.6669333333333334,
+      "grad_norm": 0.33762813065562547,
+      "learning_rate": 5.2767509060752764e-05,
+      "loss": 0.5667,
+      "step": 2501
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3616585325541724,
+      "learning_rate": 5.269139040311411e-05,
+      "loss": 0.6244,
+      "step": 2502
+    },
+    {
+      "epoch": 0.6674666666666667,
+      "grad_norm": 0.3443235724588427,
+      "learning_rate": 5.2615307043735676e-05,
+      "loss": 0.5713,
+      "step": 2503
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.3438457858788958,
+      "learning_rate": 5.253925903938538e-05,
+      "loss": 0.5999,
+      "step": 2504
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.37071917277564054,
+      "learning_rate": 5.2463246446804725e-05,
+      "loss": 0.643,
+      "step": 2505
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.3488194676574523,
+      "learning_rate": 5.2387269322708854e-05,
+      "loss": 0.5656,
+      "step": 2506
+    },
+    {
+      "epoch": 0.6685333333333333,
+      "grad_norm": 0.32135902286096674,
+      "learning_rate": 5.231132772378631e-05,
+      "loss": 0.5599,
+      "step": 2507
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3587053415055037,
+      "learning_rate": 5.223542170669926e-05,
+      "loss": 0.6155,
+      "step": 2508
+    },
+    {
+      "epoch": 0.6690666666666667,
+      "grad_norm": 0.33425986489072557,
+      "learning_rate": 5.215955132808328e-05,
+      "loss": 0.5712,
+      "step": 2509
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.3519724619626593,
+      "learning_rate": 5.2083716644547364e-05,
+      "loss": 0.5983,
+      "step": 2510
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.35133495988947727,
+      "learning_rate": 5.200791771267384e-05,
+      "loss": 0.574,
+      "step": 2511
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.3456925346287904,
+      "learning_rate": 5.193215458901837e-05,
+      "loss": 0.6037,
+      "step": 2512
+    },
+    {
+      "epoch": 0.6701333333333334,
+      "grad_norm": 0.34366282667650583,
+      "learning_rate": 5.1856427330110025e-05,
+      "loss": 0.5836,
+      "step": 2513
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3367418436781026,
+      "learning_rate": 5.1780735992450865e-05,
+      "loss": 0.6221,
+      "step": 2514
+    },
+    {
+      "epoch": 0.6706666666666666,
+      "grad_norm": 0.3490656912191057,
+      "learning_rate": 5.170508063251636e-05,
+      "loss": 0.6183,
+      "step": 2515
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.35253745610459775,
+      "learning_rate": 5.162946130675503e-05,
+      "loss": 0.596,
+      "step": 2516
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.31980301046040727,
+      "learning_rate": 5.1553878071588576e-05,
+      "loss": 0.5479,
+      "step": 2517
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.3404904477153669,
+      "learning_rate": 5.147833098341173e-05,
+      "loss": 0.5365,
+      "step": 2518
+    },
+    {
+      "epoch": 0.6717333333333333,
+      "grad_norm": 0.3535567861550599,
+      "learning_rate": 5.140282009859224e-05,
+      "loss": 0.5345,
+      "step": 2519
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.35489481564989434,
+      "learning_rate": 5.132734547347088e-05,
+      "loss": 0.5914,
+      "step": 2520
+    },
+    {
+      "epoch": 0.6722666666666667,
+      "grad_norm": 0.3475996798308405,
+      "learning_rate": 5.125190716436139e-05,
+      "loss": 0.5836,
+      "step": 2521
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.3450635115813747,
+      "learning_rate": 5.1176505227550286e-05,
+      "loss": 0.6197,
+      "step": 2522
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.37723384466315196,
+      "learning_rate": 5.110113971929708e-05,
+      "loss": 0.5709,
+      "step": 2523
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.32562501966448837,
+      "learning_rate": 5.102581069583407e-05,
+      "loss": 0.5501,
+      "step": 2524
+    },
+    {
+      "epoch": 0.6733333333333333,
+      "grad_norm": 0.31776203077361176,
+      "learning_rate": 5.0950518213366314e-05,
+      "loss": 0.5482,
+      "step": 2525
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.33030440418162826,
+      "learning_rate": 5.08752623280716e-05,
+      "loss": 0.5357,
+      "step": 2526
+    },
+    {
+      "epoch": 0.6738666666666666,
+      "grad_norm": 0.33907909419975807,
+      "learning_rate": 5.080004309610045e-05,
+      "loss": 0.5887,
+      "step": 2527
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.33793583554471157,
+      "learning_rate": 5.0724860573575994e-05,
+      "loss": 0.592,
+      "step": 2528
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.32993594798479337,
+      "learning_rate": 5.064971481659399e-05,
+      "loss": 0.555,
+      "step": 2529
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.3275825963157326,
+      "learning_rate": 5.057460588122276e-05,
+      "loss": 0.5264,
+      "step": 2530
+    },
+    {
+      "epoch": 0.6749333333333334,
+      "grad_norm": 0.3433011176300641,
+      "learning_rate": 5.049953382350314e-05,
+      "loss": 0.5878,
+      "step": 2531
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3698003650110036,
+      "learning_rate": 5.042449869944851e-05,
+      "loss": 0.5604,
+      "step": 2532
+    },
+    {
+      "epoch": 0.6754666666666667,
+      "grad_norm": 0.32268863726411673,
+      "learning_rate": 5.03495005650446e-05,
+      "loss": 0.5561,
+      "step": 2533
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.33428107397463697,
+      "learning_rate": 5.027453947624963e-05,
+      "loss": 0.5495,
+      "step": 2534
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.3446895338822114,
+      "learning_rate": 5.01996154889941e-05,
+      "loss": 0.5431,
+      "step": 2535
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.4679149445936216,
+      "learning_rate": 5.0124728659180895e-05,
+      "loss": 0.6033,
+      "step": 2536
+    },
+    {
+      "epoch": 0.6765333333333333,
+      "grad_norm": 0.3724348593690452,
+      "learning_rate": 5.004987904268519e-05,
+      "loss": 0.6005,
+      "step": 2537
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.39101113395957166,
+      "learning_rate": 4.9975066695354245e-05,
+      "loss": 0.5781,
+      "step": 2538
+    },
+    {
+      "epoch": 0.6770666666666667,
+      "grad_norm": 0.37284538949464013,
+      "learning_rate": 4.990029167300767e-05,
+      "loss": 0.5508,
+      "step": 2539
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.35232556208516896,
+      "learning_rate": 4.9825554031437194e-05,
+      "loss": 0.6228,
+      "step": 2540
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.3454147641906744,
+      "learning_rate": 4.975085382640661e-05,
+      "loss": 0.5943,
+      "step": 2541
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.34753345272013797,
+      "learning_rate": 4.9676191113651804e-05,
+      "loss": 0.5403,
+      "step": 2542
+    },
+    {
+      "epoch": 0.6781333333333334,
+      "grad_norm": 0.3567171906792844,
+      "learning_rate": 4.9601565948880704e-05,
+      "loss": 0.6441,
+      "step": 2543
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3556237187996497,
+      "learning_rate": 4.9526978387773196e-05,
+      "loss": 0.598,
+      "step": 2544
+    },
+    {
+      "epoch": 0.6786666666666666,
+      "grad_norm": 0.40240655144839815,
+      "learning_rate": 4.945242848598116e-05,
+      "loss": 0.641,
+      "step": 2545
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.3370996868151834,
+      "learning_rate": 4.9377916299128226e-05,
+      "loss": 0.6046,
+      "step": 2546
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.37809508207870274,
+      "learning_rate": 4.9303441882810106e-05,
+      "loss": 0.5886,
+      "step": 2547
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.3350394277163161,
+      "learning_rate": 4.9229005292594175e-05,
+      "loss": 0.5931,
+      "step": 2548
+    },
+    {
+      "epoch": 0.6797333333333333,
+      "grad_norm": 0.3410077041033582,
+      "learning_rate": 4.9154606584019644e-05,
+      "loss": 0.5892,
+      "step": 2549
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3578754267506827,
+      "learning_rate": 4.9080245812597434e-05,
+      "loss": 0.5984,
+      "step": 2550
+    },
+    {
+      "epoch": 0.6802666666666667,
+      "grad_norm": 0.32861571746387214,
+      "learning_rate": 4.900592303381016e-05,
+      "loss": 0.5818,
+      "step": 2551
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.4002809734898077,
+      "learning_rate": 4.893163830311216e-05,
+      "loss": 0.5982,
+      "step": 2552
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.3393118343346211,
+      "learning_rate": 4.885739167592923e-05,
+      "loss": 0.5679,
+      "step": 2553
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.335548330799039,
+      "learning_rate": 4.878318320765888e-05,
+      "loss": 0.5837,
+      "step": 2554
+    },
+    {
+      "epoch": 0.6813333333333333,
+      "grad_norm": 0.32554726875807616,
+      "learning_rate": 4.8709012953670096e-05,
+      "loss": 0.5746,
+      "step": 2555
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.348240853733335,
+      "learning_rate": 4.863488096930333e-05,
+      "loss": 0.5905,
+      "step": 2556
+    },
+    {
+      "epoch": 0.6818666666666666,
+      "grad_norm": 0.3227300220745892,
+      "learning_rate": 4.856078730987054e-05,
+      "loss": 0.5835,
+      "step": 2557
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.33548187568536797,
+      "learning_rate": 4.848673203065502e-05,
+      "loss": 0.5162,
+      "step": 2558
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.3403186240533581,
+      "learning_rate": 4.841271518691149e-05,
+      "loss": 0.5509,
+      "step": 2559
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3750547649216263,
+      "learning_rate": 4.833873683386596e-05,
+      "loss": 0.6344,
+      "step": 2560
+    },
+    {
+      "epoch": 0.6829333333333333,
+      "grad_norm": 0.3230688623238985,
+      "learning_rate": 4.826479702671567e-05,
+      "loss": 0.5731,
+      "step": 2561
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.32793200127882455,
+      "learning_rate": 4.81908958206292e-05,
+      "loss": 0.5442,
+      "step": 2562
+    },
+    {
+      "epoch": 0.6834666666666667,
+      "grad_norm": 0.35651353869835617,
+      "learning_rate": 4.811703327074626e-05,
+      "loss": 0.6099,
+      "step": 2563
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.35507964722126156,
+      "learning_rate": 4.804320943217775e-05,
+      "loss": 0.5938,
+      "step": 2564
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.34955903687334955,
+      "learning_rate": 4.796942436000568e-05,
+      "loss": 0.5815,
+      "step": 2565
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.34118412145094035,
+      "learning_rate": 4.7895678109283116e-05,
+      "loss": 0.5531,
+      "step": 2566
+    },
+    {
+      "epoch": 0.6845333333333333,
+      "grad_norm": 0.3489463603151044,
+      "learning_rate": 4.782197073503414e-05,
+      "loss": 0.5722,
+      "step": 2567
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.34257297902736417,
+      "learning_rate": 4.774830229225398e-05,
+      "loss": 0.5662,
+      "step": 2568
+    },
+    {
+      "epoch": 0.6850666666666667,
+      "grad_norm": 0.33510491412642573,
+      "learning_rate": 4.767467283590856e-05,
+      "loss": 0.5366,
+      "step": 2569
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.3559397833904609,
+      "learning_rate": 4.7601082420934925e-05,
+      "loss": 0.6124,
+      "step": 2570
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.4037254310154087,
+      "learning_rate": 4.752753110224089e-05,
+      "loss": 0.6123,
+      "step": 2571
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.3478290984169354,
+      "learning_rate": 4.7454018934705126e-05,
+      "loss": 0.598,
+      "step": 2572
+    },
+    {
+      "epoch": 0.6861333333333334,
+      "grad_norm": 0.33529657369376437,
+      "learning_rate": 4.7380545973177107e-05,
+      "loss": 0.5934,
+      "step": 2573
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.34014238865796187,
+      "learning_rate": 4.730711227247703e-05,
+      "loss": 0.5591,
+      "step": 2574
+    },
+    {
+      "epoch": 0.6866666666666666,
+      "grad_norm": 0.3372149416578562,
+      "learning_rate": 4.72337178873958e-05,
+      "loss": 0.6029,
+      "step": 2575
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.3865273943149658,
+      "learning_rate": 4.716036287269504e-05,
+      "loss": 0.6013,
+      "step": 2576
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.37867397996445734,
+      "learning_rate": 4.708704728310688e-05,
+      "loss": 0.6054,
+      "step": 2577
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.33300064292744663,
+      "learning_rate": 4.701377117333413e-05,
+      "loss": 0.5746,
+      "step": 2578
+    },
+    {
+      "epoch": 0.6877333333333333,
+      "grad_norm": 0.3412960539150721,
+      "learning_rate": 4.6940534598050135e-05,
+      "loss": 0.5783,
+      "step": 2579
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.34868200147958184,
+      "learning_rate": 4.686733761189872e-05,
+      "loss": 0.619,
+      "step": 2580
+    },
+    {
+      "epoch": 0.6882666666666667,
+      "grad_norm": 0.38928620427649707,
+      "learning_rate": 4.679418026949418e-05,
+      "loss": 0.5549,
+      "step": 2581
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.4056262048080404,
+      "learning_rate": 4.672106262542123e-05,
+      "loss": 0.6121,
+      "step": 2582
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.34796455707915097,
+      "learning_rate": 4.664798473423496e-05,
+      "loss": 0.5963,
+      "step": 2583
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.34420643683848945,
+      "learning_rate": 4.6574946650460804e-05,
+      "loss": 0.5468,
+      "step": 2584
+    },
+    {
+      "epoch": 0.6893333333333334,
+      "grad_norm": 0.3603720102963772,
+      "learning_rate": 4.650194842859449e-05,
+      "loss": 0.6162,
+      "step": 2585
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3720903972014294,
+      "learning_rate": 4.6428990123102014e-05,
+      "loss": 0.5746,
+      "step": 2586
+    },
+    {
+      "epoch": 0.6898666666666666,
+      "grad_norm": 0.3372115846788491,
+      "learning_rate": 4.6356071788419584e-05,
+      "loss": 0.566,
+      "step": 2587
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.39191337422988803,
+      "learning_rate": 4.6283193478953566e-05,
+      "loss": 0.5419,
+      "step": 2588
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.3548730373313539,
+      "learning_rate": 4.6210355249080505e-05,
+      "loss": 0.6079,
+      "step": 2589
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.35360031311505496,
+      "learning_rate": 4.6137557153147005e-05,
+      "loss": 0.588,
+      "step": 2590
+    },
+    {
+      "epoch": 0.6909333333333333,
+      "grad_norm": 0.33920376307324135,
+      "learning_rate": 4.606479924546977e-05,
+      "loss": 0.5963,
+      "step": 2591
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3606990133244874,
+      "learning_rate": 4.599208158033541e-05,
+      "loss": 0.6096,
+      "step": 2592
+    },
+    {
+      "epoch": 0.6914666666666667,
+      "grad_norm": 0.3423405811888361,
+      "learning_rate": 4.5919404212000614e-05,
+      "loss": 0.5978,
+      "step": 2593
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.32687460140451796,
+      "learning_rate": 4.5846767194692e-05,
+      "loss": 0.5976,
+      "step": 2594
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.32446028355999235,
+      "learning_rate": 4.577417058260602e-05,
+      "loss": 0.5903,
+      "step": 2595
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.33956085172623224,
+      "learning_rate": 4.570161442990903e-05,
+      "loss": 0.5678,
+      "step": 2596
+    },
+    {
+      "epoch": 0.6925333333333333,
+      "grad_norm": 0.35040416393857554,
+      "learning_rate": 4.562909879073719e-05,
+      "loss": 0.6134,
+      "step": 2597
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.33152674715543834,
+      "learning_rate": 4.555662371919639e-05,
+      "loss": 0.5591,
+      "step": 2598
+    },
+    {
+      "epoch": 0.6930666666666667,
+      "grad_norm": 0.4203000284758059,
+      "learning_rate": 4.548418926936234e-05,
+      "loss": 0.5617,
+      "step": 2599
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.3518319436988351,
+      "learning_rate": 4.541179549528032e-05,
+      "loss": 0.5946,
+      "step": 2600
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.3490569973230447,
+      "learning_rate": 4.533944245096533e-05,
+      "loss": 0.6086,
+      "step": 2601
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.34871934309748615,
+      "learning_rate": 4.526713019040196e-05,
+      "loss": 0.6161,
+      "step": 2602
+    },
+    {
+      "epoch": 0.6941333333333334,
+      "grad_norm": 0.3338480599616353,
+      "learning_rate": 4.519485876754446e-05,
+      "loss": 0.5946,
+      "step": 2603
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.43003597600927673,
+      "learning_rate": 4.512262823631648e-05,
+      "loss": 0.5936,
+      "step": 2604
+    },
+    {
+      "epoch": 0.6946666666666667,
+      "grad_norm": 0.34478712680790746,
+      "learning_rate": 4.505043865061124e-05,
+      "loss": 0.5458,
+      "step": 2605
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.3397244100583301,
+      "learning_rate": 4.497829006429136e-05,
+      "loss": 0.5522,
+      "step": 2606
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.3565949685173397,
+      "learning_rate": 4.490618253118895e-05,
+      "loss": 0.5763,
+      "step": 2607
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.32851689301954007,
+      "learning_rate": 4.4834116105105325e-05,
+      "loss": 0.5663,
+      "step": 2608
+    },
+    {
+      "epoch": 0.6957333333333333,
+      "grad_norm": 0.3464260479874418,
+      "learning_rate": 4.476209083981131e-05,
+      "loss": 0.5604,
+      "step": 2609
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.35022616692552616,
+      "learning_rate": 4.469010678904694e-05,
+      "loss": 0.6044,
+      "step": 2610
+    },
+    {
+      "epoch": 0.6962666666666667,
+      "grad_norm": 0.32363297510402633,
+      "learning_rate": 4.461816400652148e-05,
+      "loss": 0.6145,
+      "step": 2611
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.3456997575243997,
+      "learning_rate": 4.454626254591344e-05,
+      "loss": 0.6622,
+      "step": 2612
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.3903246172040829,
+      "learning_rate": 4.447440246087049e-05,
+      "loss": 0.6144,
+      "step": 2613
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.34037631740940844,
+      "learning_rate": 4.440258380500942e-05,
+      "loss": 0.5891,
+      "step": 2614
+    },
+    {
+      "epoch": 0.6973333333333334,
+      "grad_norm": 0.34431456055421017,
+      "learning_rate": 4.433080663191615e-05,
+      "loss": 0.5682,
+      "step": 2615
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3474196486075847,
+      "learning_rate": 4.4259070995145544e-05,
+      "loss": 0.6116,
+      "step": 2616
+    },
+    {
+      "epoch": 0.6978666666666666,
+      "grad_norm": 0.3560638134351646,
+      "learning_rate": 4.418737694822156e-05,
+      "loss": 0.6119,
+      "step": 2617
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.34518385346353603,
+      "learning_rate": 4.4115724544637124e-05,
+      "loss": 0.6239,
+      "step": 2618
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.3732215335042505,
+      "learning_rate": 4.4044113837854074e-05,
+      "loss": 0.6278,
+      "step": 2619
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.334373039075218,
+      "learning_rate": 4.397254488130312e-05,
+      "loss": 0.5811,
+      "step": 2620
+    },
+    {
+      "epoch": 0.6989333333333333,
+      "grad_norm": 0.36055205242008437,
+      "learning_rate": 4.390101772838385e-05,
+      "loss": 0.6168,
+      "step": 2621
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.34268108833532807,
+      "learning_rate": 4.382953243246465e-05,
+      "loss": 0.5514,
+      "step": 2622
+    },
+    {
+      "epoch": 0.6994666666666667,
+      "grad_norm": 0.3522581780460182,
+      "learning_rate": 4.3758089046882664e-05,
+      "loss": 0.6305,
+      "step": 2623
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.3190086924293713,
+      "learning_rate": 4.368668762494379e-05,
+      "loss": 0.5638,
+      "step": 2624
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.3781872459672398,
+      "learning_rate": 4.361532821992258e-05,
+      "loss": 0.5819,
+      "step": 2625
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.3292043824104552,
+      "learning_rate": 4.354401088506227e-05,
+      "loss": 0.5732,
+      "step": 2626
+    },
+    {
+      "epoch": 0.7005333333333333,
+      "grad_norm": 0.3438481638598106,
+      "learning_rate": 4.347273567357469e-05,
+      "loss": 0.5682,
+      "step": 2627
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3655178951282547,
+      "learning_rate": 4.340150263864024e-05,
+      "loss": 0.5838,
+      "step": 2628
+    },
+    {
+      "epoch": 0.7010666666666666,
+      "grad_norm": 0.3214380053326501,
+      "learning_rate": 4.333031183340788e-05,
+      "loss": 0.5686,
+      "step": 2629
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.3504043643306748,
+      "learning_rate": 4.3259163310995e-05,
+      "loss": 0.5956,
+      "step": 2630
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.3202911831798422,
+      "learning_rate": 4.3188057124487534e-05,
+      "loss": 0.5668,
+      "step": 2631
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.34591908224035883,
+      "learning_rate": 4.31169933269397e-05,
+      "loss": 0.6084,
+      "step": 2632
+    },
+    {
+      "epoch": 0.7021333333333334,
+      "grad_norm": 0.32118578511432455,
+      "learning_rate": 4.304597197137419e-05,
+      "loss": 0.5919,
+      "step": 2633
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.3365449968041733,
+      "learning_rate": 4.297499311078199e-05,
+      "loss": 0.5793,
+      "step": 2634
+    },
+    {
+      "epoch": 0.7026666666666667,
+      "grad_norm": 0.35972867136611175,
+      "learning_rate": 4.2904056798122406e-05,
+      "loss": 0.5699,
+      "step": 2635
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.3528624894925796,
+      "learning_rate": 4.283316308632297e-05,
+      "loss": 0.58,
+      "step": 2636
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.33220790004422346,
+      "learning_rate": 4.276231202827944e-05,
+      "loss": 0.5745,
+      "step": 2637
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.32672276059322863,
+      "learning_rate": 4.269150367685575e-05,
+      "loss": 0.5872,
+      "step": 2638
+    },
+    {
+      "epoch": 0.7037333333333333,
+      "grad_norm": 0.341027566678748,
+      "learning_rate": 4.262073808488397e-05,
+      "loss": 0.5641,
+      "step": 2639
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3386046463377781,
+      "learning_rate": 4.2550015305164245e-05,
+      "loss": 0.5892,
+      "step": 2640
+    },
+    {
+      "epoch": 0.7042666666666667,
+      "grad_norm": 0.4366696668404764,
+      "learning_rate": 4.2479335390464815e-05,
+      "loss": 0.5479,
+      "step": 2641
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.31959445453441143,
+      "learning_rate": 4.2408698393521906e-05,
+      "loss": 0.5856,
+      "step": 2642
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.3471296666761095,
+      "learning_rate": 4.233810436703973e-05,
+      "loss": 0.5676,
+      "step": 2643
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.34275353742735265,
+      "learning_rate": 4.226755336369046e-05,
+      "loss": 0.591,
+      "step": 2644
+    },
+    {
+      "epoch": 0.7053333333333334,
+      "grad_norm": 0.3456530260951439,
+      "learning_rate": 4.219704543611412e-05,
+      "loss": 0.6233,
+      "step": 2645
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3378058528363393,
+      "learning_rate": 4.212658063691867e-05,
+      "loss": 0.5735,
+      "step": 2646
+    },
+    {
+      "epoch": 0.7058666666666666,
+      "grad_norm": 0.3417206932496116,
+      "learning_rate": 4.2056159018679774e-05,
+      "loss": 0.6103,
+      "step": 2647
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.3231082122786425,
+      "learning_rate": 4.1985780633940985e-05,
+      "loss": 0.5614,
+      "step": 2648
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.7189689071660238,
+      "learning_rate": 4.191544553521355e-05,
+      "loss": 0.5381,
+      "step": 2649
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.36053623768076987,
+      "learning_rate": 4.184515377497643e-05,
+      "loss": 0.6074,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7069333333333333,
+      "grad_norm": 0.3486997937899941,
+      "learning_rate": 4.177490540567626e-05,
+      "loss": 0.6239,
+      "step": 2651
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.34005580084293835,
+      "learning_rate": 4.170470047972727e-05,
+      "loss": 0.5706,
+      "step": 2652
+    },
+    {
+      "epoch": 0.7074666666666667,
+      "grad_norm": 0.3506047926327882,
+      "learning_rate": 4.16345390495113e-05,
+      "loss": 0.5555,
+      "step": 2653
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.332909996257306,
+      "learning_rate": 4.1564421167377785e-05,
+      "loss": 0.5827,
+      "step": 2654
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.3283646986773106,
+      "learning_rate": 4.149434688564352e-05,
+      "loss": 0.5683,
+      "step": 2655
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.3417714108882349,
+      "learning_rate": 4.142431625659291e-05,
+      "loss": 0.5635,
+      "step": 2656
+    },
+    {
+      "epoch": 0.7085333333333333,
+      "grad_norm": 0.34258674126155625,
+      "learning_rate": 4.13543293324777e-05,
+      "loss": 0.5373,
+      "step": 2657
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3593930282034007,
+      "learning_rate": 4.128438616551714e-05,
+      "loss": 0.6112,
+      "step": 2658
+    },
+    {
+      "epoch": 0.7090666666666666,
+      "grad_norm": 0.3402599240319362,
+      "learning_rate": 4.1214486807897726e-05,
+      "loss": 0.6286,
+      "step": 2659
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.348463798971244,
+      "learning_rate": 4.1144631311773296e-05,
+      "loss": 0.5619,
+      "step": 2660
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.365445806915749,
+      "learning_rate": 4.1074819729264956e-05,
+      "loss": 0.6243,
+      "step": 2661
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.3546416460696886,
+      "learning_rate": 4.100505211246111e-05,
+      "loss": 0.5552,
+      "step": 2662
+    },
+    {
+      "epoch": 0.7101333333333333,
+      "grad_norm": 0.3322707356375658,
+      "learning_rate": 4.093532851341723e-05,
+      "loss": 0.6049,
+      "step": 2663
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.35282429367718965,
+      "learning_rate": 4.0865648984156037e-05,
+      "loss": 0.599,
+      "step": 2664
+    },
+    {
+      "epoch": 0.7106666666666667,
+      "grad_norm": 0.34663001757399187,
+      "learning_rate": 4.079601357666736e-05,
+      "loss": 0.6044,
+      "step": 2665
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.37580705308182843,
+      "learning_rate": 4.072642234290811e-05,
+      "loss": 0.6294,
+      "step": 2666
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.35212344160202014,
+      "learning_rate": 4.065687533480221e-05,
+      "loss": 0.5757,
+      "step": 2667
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.3430637332492065,
+      "learning_rate": 4.058737260424062e-05,
+      "loss": 0.5762,
+      "step": 2668
+    },
+    {
+      "epoch": 0.7117333333333333,
+      "grad_norm": 0.31636201051778423,
+      "learning_rate": 4.051791420308125e-05,
+      "loss": 0.5675,
+      "step": 2669
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.33345047326619004,
+      "learning_rate": 4.0448500183148965e-05,
+      "loss": 0.5725,
+      "step": 2670
+    },
+    {
+      "epoch": 0.7122666666666667,
+      "grad_norm": 0.3429815277484449,
+      "learning_rate": 4.037913059623539e-05,
+      "loss": 0.5731,
+      "step": 2671
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.3500791897144474,
+      "learning_rate": 4.030980549409915e-05,
+      "loss": 0.5358,
+      "step": 2672
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.32081388886237383,
+      "learning_rate": 4.02405249284656e-05,
+      "loss": 0.5809,
+      "step": 2673
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.3308509347928257,
+      "learning_rate": 4.0171288951026896e-05,
+      "loss": 0.5189,
+      "step": 2674
+    },
+    {
+      "epoch": 0.7133333333333334,
+      "grad_norm": 0.3394308605459211,
+      "learning_rate": 4.0102097613441916e-05,
+      "loss": 0.5596,
+      "step": 2675
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.34126220943791,
+      "learning_rate": 4.0032950967336214e-05,
+      "loss": 0.5916,
+      "step": 2676
+    },
+    {
+      "epoch": 0.7138666666666666,
+      "grad_norm": 0.3381538436086013,
+      "learning_rate": 3.996384906430202e-05,
+      "loss": 0.5934,
+      "step": 2677
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.3380942162126229,
+      "learning_rate": 3.989479195589817e-05,
+      "loss": 0.5583,
+      "step": 2678
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.33669057744214026,
+      "learning_rate": 3.9825779693650076e-05,
+      "loss": 0.5652,
+      "step": 2679
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.3357000295249724,
+      "learning_rate": 3.9756812329049706e-05,
+      "loss": 0.5441,
+      "step": 2680
+    },
+    {
+      "epoch": 0.7149333333333333,
+      "grad_norm": 0.3210033657002956,
+      "learning_rate": 3.968788991355552e-05,
+      "loss": 0.629,
+      "step": 2681
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.34486798191198575,
+      "learning_rate": 3.961901249859241e-05,
+      "loss": 0.5862,
+      "step": 2682
+    },
+    {
+      "epoch": 0.7154666666666667,
+      "grad_norm": 0.3351316950217347,
+      "learning_rate": 3.955018013555174e-05,
+      "loss": 0.5364,
+      "step": 2683
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.3179747319531608,
+      "learning_rate": 3.948139287579122e-05,
+      "loss": 0.5639,
+      "step": 2684
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.3359153652544578,
+      "learning_rate": 3.941265077063497e-05,
+      "loss": 0.5775,
+      "step": 2685
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.3499153314057488,
+      "learning_rate": 3.9343953871373306e-05,
+      "loss": 0.6228,
+      "step": 2686
+    },
+    {
+      "epoch": 0.7165333333333334,
+      "grad_norm": 0.333132020596246,
+      "learning_rate": 3.927530222926291e-05,
+      "loss": 0.5853,
+      "step": 2687
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3410323919520696,
+      "learning_rate": 3.9206695895526666e-05,
+      "loss": 0.5663,
+      "step": 2688
+    },
+    {
+      "epoch": 0.7170666666666666,
+      "grad_norm": 0.3453388074042059,
+      "learning_rate": 3.913813492135366e-05,
+      "loss": 0.5768,
+      "step": 2689
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.3379574720399497,
+      "learning_rate": 3.9069619357899137e-05,
+      "loss": 0.5388,
+      "step": 2690
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.3321920020102771,
+      "learning_rate": 3.900114925628443e-05,
+      "loss": 0.5331,
+      "step": 2691
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.3423234332914018,
+      "learning_rate": 3.8932724667597e-05,
+      "loss": 0.5675,
+      "step": 2692
+    },
+    {
+      "epoch": 0.7181333333333333,
+      "grad_norm": 0.34627067332119427,
+      "learning_rate": 3.8864345642890285e-05,
+      "loss": 0.5866,
+      "step": 2693
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3481216742300727,
+      "learning_rate": 3.879601223318381e-05,
+      "loss": 0.541,
+      "step": 2694
+    },
+    {
+      "epoch": 0.7186666666666667,
+      "grad_norm": 0.3533724827916215,
+      "learning_rate": 3.872772448946298e-05,
+      "loss": 0.5586,
+      "step": 2695
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.3409493010942156,
+      "learning_rate": 3.8659482462679186e-05,
+      "loss": 0.5874,
+      "step": 2696
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.33703755634795984,
+      "learning_rate": 3.8591286203749675e-05,
+      "loss": 0.6389,
+      "step": 2697
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.33642926964243336,
+      "learning_rate": 3.8523135763557586e-05,
+      "loss": 0.5694,
+      "step": 2698
+    },
+    {
+      "epoch": 0.7197333333333333,
+      "grad_norm": 0.36002017706901657,
+      "learning_rate": 3.845503119295182e-05,
+      "loss": 0.5922,
+      "step": 2699
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.33680011289406897,
+      "learning_rate": 3.838697254274708e-05,
+      "loss": 0.6074,
+      "step": 2700
+    },
+    {
+      "epoch": 0.7202666666666667,
+      "grad_norm": 0.3551819088159809,
+      "learning_rate": 3.8318959863723845e-05,
+      "loss": 0.6345,
+      "step": 2701
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.339625447035895,
+      "learning_rate": 3.8250993206628196e-05,
+      "loss": 0.6164,
+      "step": 2702
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.33428656437795423,
+      "learning_rate": 3.8183072622171945e-05,
+      "loss": 0.5761,
+      "step": 2703
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.33766082620220056,
+      "learning_rate": 3.811519816103253e-05,
+      "loss": 0.5506,
+      "step": 2704
+    },
+    {
+      "epoch": 0.7213333333333334,
+      "grad_norm": 0.32835872325437937,
+      "learning_rate": 3.804736987385296e-05,
+      "loss": 0.5544,
+      "step": 2705
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.32836417353156705,
+      "learning_rate": 3.7979587811241776e-05,
+      "loss": 0.5823,
+      "step": 2706
+    },
+    {
+      "epoch": 0.7218666666666667,
+      "grad_norm": 0.3247183003730969,
+      "learning_rate": 3.791185202377308e-05,
+      "loss": 0.5952,
+      "step": 2707
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.34706684943085486,
+      "learning_rate": 3.7844162561986386e-05,
+      "loss": 0.5767,
+      "step": 2708
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.36355256307156913,
+      "learning_rate": 3.777651947638672e-05,
+      "loss": 0.5756,
+      "step": 2709
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.34411426320034616,
+      "learning_rate": 3.770892281744438e-05,
+      "loss": 0.589,
+      "step": 2710
+    },
+    {
+      "epoch": 0.7229333333333333,
+      "grad_norm": 0.32949175673002024,
+      "learning_rate": 3.764137263559514e-05,
+      "loss": 0.5701,
+      "step": 2711
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3386470837306449,
+      "learning_rate": 3.7573868981240055e-05,
+      "loss": 0.5772,
+      "step": 2712
+    },
+    {
+      "epoch": 0.7234666666666667,
+      "grad_norm": 0.328897677315379,
+      "learning_rate": 3.750641190474543e-05,
+      "loss": 0.5869,
+      "step": 2713
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.3441502977221793,
+      "learning_rate": 3.743900145644292e-05,
+      "loss": 0.5845,
+      "step": 2714
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.3245188830355559,
+      "learning_rate": 3.737163768662929e-05,
+      "loss": 0.6009,
+      "step": 2715
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.368042509101508,
+      "learning_rate": 3.730432064556655e-05,
+      "loss": 0.5984,
+      "step": 2716
+    },
+    {
+      "epoch": 0.7245333333333334,
+      "grad_norm": 0.3092323120645295,
+      "learning_rate": 3.723705038348172e-05,
+      "loss": 0.5397,
+      "step": 2717
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.35887390833634286,
+      "learning_rate": 3.716982695056705e-05,
+      "loss": 0.5779,
+      "step": 2718
+    },
+    {
+      "epoch": 0.7250666666666666,
+      "grad_norm": 0.36867587051516315,
+      "learning_rate": 3.7102650396979775e-05,
+      "loss": 0.603,
+      "step": 2719
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.3504788968425937,
+      "learning_rate": 3.7035520772842215e-05,
+      "loss": 0.6275,
+      "step": 2720
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.35438892006742084,
+      "learning_rate": 3.69684381282416e-05,
+      "loss": 0.5881,
+      "step": 2721
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.34325493506533183,
+      "learning_rate": 3.6901402513230176e-05,
+      "loss": 0.5949,
+      "step": 2722
+    },
+    {
+      "epoch": 0.7261333333333333,
+      "grad_norm": 0.3180607835872949,
+      "learning_rate": 3.683441397782504e-05,
+      "loss": 0.5316,
+      "step": 2723
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3371022609370693,
+      "learning_rate": 3.676747257200824e-05,
+      "loss": 0.6321,
+      "step": 2724
+    },
+    {
+      "epoch": 0.7266666666666667,
+      "grad_norm": 0.34294720467767303,
+      "learning_rate": 3.670057834572653e-05,
+      "loss": 0.6166,
+      "step": 2725
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.34601852533763955,
+      "learning_rate": 3.6633731348891576e-05,
+      "loss": 0.5642,
+      "step": 2726
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.3291346370127747,
+      "learning_rate": 3.656693163137978e-05,
+      "loss": 0.5935,
+      "step": 2727
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.3482549678259148,
+      "learning_rate": 3.650017924303223e-05,
+      "loss": 0.5638,
+      "step": 2728
+    },
+    {
+      "epoch": 0.7277333333333333,
+      "grad_norm": 0.35577092272633354,
+      "learning_rate": 3.6433474233654755e-05,
+      "loss": 0.5621,
+      "step": 2729
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3500587318617331,
+      "learning_rate": 3.636681665301779e-05,
+      "loss": 0.6056,
+      "step": 2730
+    },
+    {
+      "epoch": 0.7282666666666666,
+      "grad_norm": 0.3259015848550714,
+      "learning_rate": 3.630020655085638e-05,
+      "loss": 0.5866,
+      "step": 2731
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.3421158499942335,
+      "learning_rate": 3.623364397687021e-05,
+      "loss": 0.5913,
+      "step": 2732
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.3527844139938932,
+      "learning_rate": 3.616712898072341e-05,
+      "loss": 0.6093,
+      "step": 2733
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.3540040986675042,
+      "learning_rate": 3.6100661612044674e-05,
+      "loss": 0.6377,
+      "step": 2734
+    },
+    {
+      "epoch": 0.7293333333333333,
+      "grad_norm": 0.33951088152217124,
+      "learning_rate": 3.6034241920427146e-05,
+      "loss": 0.586,
+      "step": 2735
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.36040966386816387,
+      "learning_rate": 3.596786995542838e-05,
+      "loss": 0.575,
+      "step": 2736
+    },
+    {
+      "epoch": 0.7298666666666667,
+      "grad_norm": 0.35036495557156944,
+      "learning_rate": 3.590154576657033e-05,
+      "loss": 0.5708,
+      "step": 2737
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.3402095274506431,
+      "learning_rate": 3.583526940333932e-05,
+      "loss": 0.5944,
+      "step": 2738
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.31256515109998495,
+      "learning_rate": 3.576904091518597e-05,
+      "loss": 0.5227,
+      "step": 2739
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.3341459090386711,
+      "learning_rate": 3.5702860351525216e-05,
+      "loss": 0.6042,
+      "step": 2740
+    },
+    {
+      "epoch": 0.7309333333333333,
+      "grad_norm": 0.321061397417268,
+      "learning_rate": 3.563672776173613e-05,
+      "loss": 0.559,
+      "step": 2741
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3302288284779727,
+      "learning_rate": 3.557064319516211e-05,
+      "loss": 0.5772,
+      "step": 2742
+    },
+    {
+      "epoch": 0.7314666666666667,
+      "grad_norm": 0.3587254813301968,
+      "learning_rate": 3.5504606701110674e-05,
+      "loss": 0.5616,
+      "step": 2743
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.33612652088475176,
+      "learning_rate": 3.5438618328853466e-05,
+      "loss": 0.5686,
+      "step": 2744
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.32337776100716026,
+      "learning_rate": 3.5372678127626236e-05,
+      "loss": 0.5622,
+      "step": 2745
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.31930456103097127,
+      "learning_rate": 3.5306786146628803e-05,
+      "loss": 0.5552,
+      "step": 2746
+    },
+    {
+      "epoch": 0.7325333333333334,
+      "grad_norm": 0.3533114791890799,
+      "learning_rate": 3.524094243502497e-05,
+      "loss": 0.5946,
+      "step": 2747
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3349334612823938,
+      "learning_rate": 3.517514704194256e-05,
+      "loss": 0.5825,
+      "step": 2748
+    },
+    {
+      "epoch": 0.7330666666666666,
+      "grad_norm": 0.35261757290925116,
+      "learning_rate": 3.510940001647334e-05,
+      "loss": 0.6046,
+      "step": 2749
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.3373037734584713,
+      "learning_rate": 3.504370140767297e-05,
+      "loss": 0.6056,
+      "step": 2750
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.3353647503081956,
+      "learning_rate": 3.4978051264561e-05,
+      "loss": 0.5732,
+      "step": 2751
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.3520589217906225,
+      "learning_rate": 3.491244963612082e-05,
+      "loss": 0.5711,
+      "step": 2752
+    },
+    {
+      "epoch": 0.7341333333333333,
+      "grad_norm": 0.3543274292844667,
+      "learning_rate": 3.4846896571299615e-05,
+      "loss": 0.6096,
+      "step": 2753
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.36631220619876814,
+      "learning_rate": 3.478139211900833e-05,
+      "loss": 0.6253,
+      "step": 2754
+    },
+    {
+      "epoch": 0.7346666666666667,
+      "grad_norm": 0.5896318232010979,
+      "learning_rate": 3.471593632812169e-05,
+      "loss": 0.5979,
+      "step": 2755
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.34658342927525737,
+      "learning_rate": 3.465052924747799e-05,
+      "loss": 0.6239,
+      "step": 2756
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.33573335206497024,
+      "learning_rate": 3.458517092587931e-05,
+      "loss": 0.5616,
+      "step": 2757
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.3322460637850616,
+      "learning_rate": 3.451986141209128e-05,
+      "loss": 0.6083,
+      "step": 2758
+    },
+    {
+      "epoch": 0.7357333333333334,
+      "grad_norm": 0.3606929380552638,
+      "learning_rate": 3.445460075484315e-05,
+      "loss": 0.6021,
+      "step": 2759
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.33386289228713206,
+      "learning_rate": 3.438938900282768e-05,
+      "loss": 0.5762,
+      "step": 2760
+    },
+    {
+      "epoch": 0.7362666666666666,
+      "grad_norm": 0.3754182581468279,
+      "learning_rate": 3.432422620470117e-05,
+      "loss": 0.5654,
+      "step": 2761
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.3580448255914008,
+      "learning_rate": 3.425911240908338e-05,
+      "loss": 0.538,
+      "step": 2762
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.348729716974533,
+      "learning_rate": 3.419404766455755e-05,
+      "loss": 0.5977,
+      "step": 2763
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.36504385827184765,
+      "learning_rate": 3.412903201967022e-05,
+      "loss": 0.5955,
+      "step": 2764
+    },
+    {
+      "epoch": 0.7373333333333333,
+      "grad_norm": 0.36453391448094125,
+      "learning_rate": 3.4064065522931364e-05,
+      "loss": 0.5864,
+      "step": 2765
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.34426344395579583,
+      "learning_rate": 3.3999148222814316e-05,
+      "loss": 0.5753,
+      "step": 2766
+    },
+    {
+      "epoch": 0.7378666666666667,
+      "grad_norm": 0.3516850748631634,
+      "learning_rate": 3.393428016775565e-05,
+      "loss": 0.5759,
+      "step": 2767
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.3628796219858617,
+      "learning_rate": 3.386946140615517e-05,
+      "loss": 0.6167,
+      "step": 2768
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.33271508301085917,
+      "learning_rate": 3.3804691986376034e-05,
+      "loss": 0.5604,
+      "step": 2769
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.34987970715201805,
+      "learning_rate": 3.373997195674444e-05,
+      "loss": 0.5855,
+      "step": 2770
+    },
+    {
+      "epoch": 0.7389333333333333,
+      "grad_norm": 0.4009078897492756,
+      "learning_rate": 3.367530136554984e-05,
+      "loss": 0.675,
+      "step": 2771
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.355531287267289,
+      "learning_rate": 3.361068026104466e-05,
+      "loss": 0.5919,
+      "step": 2772
+    },
+    {
+      "epoch": 0.7394666666666667,
+      "grad_norm": 0.36164773264924877,
+      "learning_rate": 3.3546108691444544e-05,
+      "loss": 0.5489,
+      "step": 2773
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.35484373258109436,
+      "learning_rate": 3.3481586704928123e-05,
+      "loss": 0.575,
+      "step": 2774
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.3603856426092299,
+      "learning_rate": 3.341711434963703e-05,
+      "loss": 0.5583,
+      "step": 2775
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.35102099841871875,
+      "learning_rate": 3.335269167367586e-05,
+      "loss": 0.5972,
+      "step": 2776
+    },
+    {
+      "epoch": 0.7405333333333334,
+      "grad_norm": 0.35954375592491183,
+      "learning_rate": 3.328831872511216e-05,
+      "loss": 0.6176,
+      "step": 2777
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3203867630802601,
+      "learning_rate": 3.3223995551976364e-05,
+      "loss": 0.5894,
+      "step": 2778
+    },
+    {
+      "epoch": 0.7410666666666667,
+      "grad_norm": 0.3418286289943729,
+      "learning_rate": 3.315972220226179e-05,
+      "loss": 0.5756,
+      "step": 2779
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.3400885588179178,
+      "learning_rate": 3.309549872392451e-05,
+      "loss": 0.5612,
+      "step": 2780
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.3588257717229326,
+      "learning_rate": 3.3031325164883466e-05,
+      "loss": 0.5906,
+      "step": 2781
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.339092122833708,
+      "learning_rate": 3.296720157302031e-05,
+      "loss": 0.5604,
+      "step": 2782
+    },
+    {
+      "epoch": 0.7421333333333333,
+      "grad_norm": 0.3808127497745609,
+      "learning_rate": 3.290312799617944e-05,
+      "loss": 0.6332,
+      "step": 2783
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3393912738713724,
+      "learning_rate": 3.2839104482167914e-05,
+      "loss": 0.5602,
+      "step": 2784
+    },
+    {
+      "epoch": 0.7426666666666667,
+      "grad_norm": 0.3683834269616866,
+      "learning_rate": 3.277513107875544e-05,
+      "loss": 0.6171,
+      "step": 2785
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.3310670819530194,
+      "learning_rate": 3.271120783367435e-05,
+      "loss": 0.5278,
+      "step": 2786
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.3445808893867133,
+      "learning_rate": 3.264733479461953e-05,
+      "loss": 0.577,
+      "step": 2787
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.36301230687627056,
+      "learning_rate": 3.258351200924844e-05,
+      "loss": 0.6084,
+      "step": 2788
+    },
+    {
+      "epoch": 0.7437333333333334,
+      "grad_norm": 0.3449804768380338,
+      "learning_rate": 3.2519739525181007e-05,
+      "loss": 0.5664,
+      "step": 2789
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.34825749563458047,
+      "learning_rate": 3.245601738999964e-05,
+      "loss": 0.5736,
+      "step": 2790
+    },
+    {
+      "epoch": 0.7442666666666666,
+      "grad_norm": 0.34240059510136145,
+      "learning_rate": 3.239234565124919e-05,
+      "loss": 0.5492,
+      "step": 2791
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.3562829720005136,
+      "learning_rate": 3.23287243564369e-05,
+      "loss": 0.5861,
+      "step": 2792
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.3223001854588283,
+      "learning_rate": 3.226515355303237e-05,
+      "loss": 0.5926,
+      "step": 2793
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.3206364442570673,
+      "learning_rate": 3.220163328846757e-05,
+      "loss": 0.5691,
+      "step": 2794
+    },
+    {
+      "epoch": 0.7453333333333333,
+      "grad_norm": 0.3571916770068726,
+      "learning_rate": 3.2138163610136665e-05,
+      "loss": 0.6011,
+      "step": 2795
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3550758504410085,
+      "learning_rate": 3.207474456539615e-05,
+      "loss": 0.5645,
+      "step": 2796
+    },
+    {
+      "epoch": 0.7458666666666667,
+      "grad_norm": 0.3429306121438903,
+      "learning_rate": 3.201137620156475e-05,
+      "loss": 0.5494,
+      "step": 2797
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.3578267104696251,
+      "learning_rate": 3.1948058565923324e-05,
+      "loss": 0.6124,
+      "step": 2798
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.35810356477423677,
+      "learning_rate": 3.188479170571493e-05,
+      "loss": 0.5364,
+      "step": 2799
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.3433763452975484,
+      "learning_rate": 3.182157566814471e-05,
+      "loss": 0.5747,
+      "step": 2800
+    },
+    {
+      "epoch": 0.7469333333333333,
+      "grad_norm": 0.3419644464151213,
+      "learning_rate": 3.17584105003799e-05,
+      "loss": 0.5917,
+      "step": 2801
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.32329900358403585,
+      "learning_rate": 3.16952962495498e-05,
+      "loss": 0.5778,
+      "step": 2802
+    },
+    {
+      "epoch": 0.7474666666666666,
+      "grad_norm": 0.32813276016351867,
+      "learning_rate": 3.163223296274561e-05,
+      "loss": 0.5294,
+      "step": 2803
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.34099670470381055,
+      "learning_rate": 3.1569220687020675e-05,
+      "loss": 0.5689,
+      "step": 2804
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.35336381033368786,
+      "learning_rate": 3.1506259469390173e-05,
+      "loss": 0.6004,
+      "step": 2805
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.4062749773569371,
+      "learning_rate": 3.144334935683121e-05,
+      "loss": 0.5738,
+      "step": 2806
+    },
+    {
+      "epoch": 0.7485333333333334,
+      "grad_norm": 0.32575080749610674,
+      "learning_rate": 3.138049039628273e-05,
+      "loss": 0.5714,
+      "step": 2807
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3572483766931299,
+      "learning_rate": 3.1317682634645586e-05,
+      "loss": 0.6132,
+      "step": 2808
+    },
+    {
+      "epoch": 0.7490666666666667,
+      "grad_norm": 0.3327769902717611,
+      "learning_rate": 3.1254926118782346e-05,
+      "loss": 0.57,
+      "step": 2809
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.32477417536389414,
+      "learning_rate": 3.119222089551743e-05,
+      "loss": 0.5952,
+      "step": 2810
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.4144458561520922,
+      "learning_rate": 3.1129567011636875e-05,
+      "loss": 0.5897,
+      "step": 2811
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.33359936621612046,
+      "learning_rate": 3.1066964513888486e-05,
+      "loss": 0.5574,
+      "step": 2812
+    },
+    {
+      "epoch": 0.7501333333333333,
+      "grad_norm": 0.34162445655538076,
+      "learning_rate": 3.1004413448981726e-05,
+      "loss": 0.5775,
+      "step": 2813
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.3745261818377158,
+      "learning_rate": 3.094191386358768e-05,
+      "loss": 0.6403,
+      "step": 2814
+    },
+    {
+      "epoch": 0.7506666666666667,
+      "grad_norm": 0.3381915213786633,
+      "learning_rate": 3.0879465804339016e-05,
+      "loss": 0.5716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.3553817135977034,
+      "learning_rate": 3.081706931782994e-05,
+      "loss": 0.5936,
+      "step": 2816
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.3392212116679093,
+      "learning_rate": 3.07547244506162e-05,
+      "loss": 0.5675,
+      "step": 2817
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.3419158329222798,
+      "learning_rate": 3.069243124921507e-05,
+      "loss": 0.5904,
+      "step": 2818
+    },
+    {
+      "epoch": 0.7517333333333334,
+      "grad_norm": 0.362236353952747,
+      "learning_rate": 3.063018976010514e-05,
+      "loss": 0.6215,
+      "step": 2819
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.32366659489489924,
+      "learning_rate": 3.056800002972655e-05,
+      "loss": 0.6029,
+      "step": 2820
+    },
+    {
+      "epoch": 0.7522666666666666,
+      "grad_norm": 0.3327667473773849,
+      "learning_rate": 3.0505862104480787e-05,
+      "loss": 0.5427,
+      "step": 2821
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.333461734242426,
+      "learning_rate": 3.0443776030730653e-05,
+      "loss": 0.5664,
+      "step": 2822
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.3528611971703985,
+      "learning_rate": 3.0381741854800283e-05,
+      "loss": 0.5916,
+      "step": 2823
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.3432974171601048,
+      "learning_rate": 3.0319759622975062e-05,
+      "loss": 0.5779,
+      "step": 2824
+    },
+    {
+      "epoch": 0.7533333333333333,
+      "grad_norm": 0.3359747156313507,
+      "learning_rate": 3.0257829381501725e-05,
+      "loss": 0.5797,
+      "step": 2825
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.33007529909848426,
+      "learning_rate": 3.019595117658811e-05,
+      "loss": 0.5848,
+      "step": 2826
+    },
+    {
+      "epoch": 0.7538666666666667,
+      "grad_norm": 0.33089153643306335,
+      "learning_rate": 3.0134125054403207e-05,
+      "loss": 0.6071,
+      "step": 2827
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.35421218210284616,
+      "learning_rate": 3.0072351061077208e-05,
+      "loss": 0.6004,
+      "step": 2828
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.34489451227933937,
+      "learning_rate": 3.0010629242701417e-05,
+      "loss": 0.5913,
+      "step": 2829
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.3467533873523663,
+      "learning_rate": 2.9948959645328177e-05,
+      "loss": 0.5669,
+      "step": 2830
+    },
+    {
+      "epoch": 0.7549333333333333,
+      "grad_norm": 0.3348711053997264,
+      "learning_rate": 2.9887342314970878e-05,
+      "loss": 0.5327,
+      "step": 2831
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.33545208915497077,
+      "learning_rate": 2.982577729760392e-05,
+      "loss": 0.5621,
+      "step": 2832
+    },
+    {
+      "epoch": 0.7554666666666666,
+      "grad_norm": 0.3596487109307188,
+      "learning_rate": 2.9764264639162677e-05,
+      "loss": 0.6048,
+      "step": 2833
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.4636999773283181,
+      "learning_rate": 2.970280438554339e-05,
+      "loss": 0.6043,
+      "step": 2834
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.34265894916271833,
+      "learning_rate": 2.9641396582603288e-05,
+      "loss": 0.5726,
+      "step": 2835
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.34947764799702663,
+      "learning_rate": 2.958004127616042e-05,
+      "loss": 0.5698,
+      "step": 2836
+    },
+    {
+      "epoch": 0.7565333333333333,
+      "grad_norm": 0.3363454553598191,
+      "learning_rate": 2.9518738511993683e-05,
+      "loss": 0.5546,
+      "step": 2837
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.3735086429816798,
+      "learning_rate": 2.9457488335842754e-05,
+      "loss": 0.5699,
+      "step": 2838
+    },
+    {
+      "epoch": 0.7570666666666667,
+      "grad_norm": 0.35097422993851685,
+      "learning_rate": 2.939629079340809e-05,
+      "loss": 0.5462,
+      "step": 2839
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.33511453700242655,
+      "learning_rate": 2.9335145930350848e-05,
+      "loss": 0.5861,
+      "step": 2840
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.3448492542930926,
+      "learning_rate": 2.927405379229292e-05,
+      "loss": 0.5544,
+      "step": 2841
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.3407360241982918,
+      "learning_rate": 2.9213014424816843e-05,
+      "loss": 0.5725,
+      "step": 2842
+    },
+    {
+      "epoch": 0.7581333333333333,
+      "grad_norm": 0.34268534583443067,
+      "learning_rate": 2.9152027873465747e-05,
+      "loss": 0.5569,
+      "step": 2843
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.35369337878939944,
+      "learning_rate": 2.9091094183743405e-05,
+      "loss": 0.5904,
+      "step": 2844
+    },
+    {
+      "epoch": 0.7586666666666667,
+      "grad_norm": 0.3544851327470538,
+      "learning_rate": 2.9030213401114127e-05,
+      "loss": 0.568,
+      "step": 2845
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.3664544687067838,
+      "learning_rate": 2.8969385571002728e-05,
+      "loss": 0.591,
+      "step": 2846
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.3366042280225982,
+      "learning_rate": 2.890861073879454e-05,
+      "loss": 0.5307,
+      "step": 2847
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.36078579440370795,
+      "learning_rate": 2.8847888949835357e-05,
+      "loss": 0.6049,
+      "step": 2848
+    },
+    {
+      "epoch": 0.7597333333333334,
+      "grad_norm": 0.33135072310074737,
+      "learning_rate": 2.878722024943139e-05,
+      "loss": 0.5966,
+      "step": 2849
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3446854577131219,
+      "learning_rate": 2.872660468284919e-05,
+      "loss": 0.5407,
+      "step": 2850
+    },
+    {
+      "epoch": 0.7602666666666666,
+      "grad_norm": 0.3452087501658542,
+      "learning_rate": 2.866604229531573e-05,
+      "loss": 0.5547,
+      "step": 2851
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.35852385022308303,
+      "learning_rate": 2.860553313201828e-05,
+      "loss": 0.6115,
+      "step": 2852
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.33154218446520856,
+      "learning_rate": 2.854507723810439e-05,
+      "loss": 0.6102,
+      "step": 2853
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.33192405887435444,
+      "learning_rate": 2.8484674658681887e-05,
+      "loss": 0.551,
+      "step": 2854
+    },
+    {
+      "epoch": 0.7613333333333333,
+      "grad_norm": 0.36128934018441156,
+      "learning_rate": 2.8424325438818798e-05,
+      "loss": 0.5719,
+      "step": 2855
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3482992086305832,
+      "learning_rate": 2.8364029623543342e-05,
+      "loss": 0.5652,
+      "step": 2856
+    },
+    {
+      "epoch": 0.7618666666666667,
+      "grad_norm": 0.35876474657284396,
+      "learning_rate": 2.8303787257843917e-05,
+      "loss": 0.5911,
+      "step": 2857
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.33025684531166216,
+      "learning_rate": 2.8243598386668924e-05,
+      "loss": 0.5706,
+      "step": 2858
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.3370857793562149,
+      "learning_rate": 2.8183463054927052e-05,
+      "loss": 0.5878,
+      "step": 2859
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.3416894819537489,
+      "learning_rate": 2.8123381307486872e-05,
+      "loss": 0.5711,
+      "step": 2860
+    },
+    {
+      "epoch": 0.7629333333333334,
+      "grad_norm": 0.3341183247416704,
+      "learning_rate": 2.8063353189177065e-05,
+      "loss": 0.5497,
+      "step": 2861
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.35025357015705827,
+      "learning_rate": 2.8003378744786245e-05,
+      "loss": 0.5404,
+      "step": 2862
+    },
+    {
+      "epoch": 0.7634666666666666,
+      "grad_norm": 0.3406201446929645,
+      "learning_rate": 2.7943458019063018e-05,
+      "loss": 0.5852,
+      "step": 2863
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.35764609510961687,
+      "learning_rate": 2.7883591056715887e-05,
+      "loss": 0.568,
+      "step": 2864
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.3555753793060286,
+      "learning_rate": 2.7823777902413272e-05,
+      "loss": 0.5547,
+      "step": 2865
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.3417393649148862,
+      "learning_rate": 2.776401860078337e-05,
+      "loss": 0.5791,
+      "step": 2866
+    },
+    {
+      "epoch": 0.7645333333333333,
+      "grad_norm": 0.4187043470260362,
+      "learning_rate": 2.7704313196414266e-05,
+      "loss": 0.6243,
+      "step": 2867
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3367323662310071,
+      "learning_rate": 2.7644661733853804e-05,
+      "loss": 0.5751,
+      "step": 2868
+    },
+    {
+      "epoch": 0.7650666666666667,
+      "grad_norm": 0.33716347882532866,
+      "learning_rate": 2.7585064257609607e-05,
+      "loss": 0.5587,
+      "step": 2869
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.3488560446831909,
+      "learning_rate": 2.7525520812148987e-05,
+      "loss": 0.6056,
+      "step": 2870
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.3501423965078887,
+      "learning_rate": 2.7466031441898955e-05,
+      "loss": 0.604,
+      "step": 2871
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.35556419708512693,
+      "learning_rate": 2.7406596191246204e-05,
+      "loss": 0.5783,
+      "step": 2872
+    },
+    {
+      "epoch": 0.7661333333333333,
+      "grad_norm": 0.3336847500086423,
+      "learning_rate": 2.734721510453695e-05,
+      "loss": 0.5423,
+      "step": 2873
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.34399136002504593,
+      "learning_rate": 2.7287888226077106e-05,
+      "loss": 0.572,
+      "step": 2874
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 0.35364208377429673,
+      "learning_rate": 2.722861560013208e-05,
+      "loss": 0.6032,
+      "step": 2875
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.34528890196523426,
+      "learning_rate": 2.716939727092682e-05,
+      "loss": 0.5994,
+      "step": 2876
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.37059081484690026,
+      "learning_rate": 2.7110233282645757e-05,
+      "loss": 0.5842,
+      "step": 2877
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.33942610536200135,
+      "learning_rate": 2.7051123679432776e-05,
+      "loss": 0.596,
+      "step": 2878
+    },
+    {
+      "epoch": 0.7677333333333334,
+      "grad_norm": 0.34570312362908784,
+      "learning_rate": 2.6992068505391198e-05,
+      "loss": 0.5324,
+      "step": 2879
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.37058252090350524,
+      "learning_rate": 2.693306780458369e-05,
+      "loss": 0.6221,
+      "step": 2880
+    },
+    {
+      "epoch": 0.7682666666666667,
+      "grad_norm": 0.3479922325256641,
+      "learning_rate": 2.6874121621032334e-05,
+      "loss": 0.5957,
+      "step": 2881
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.34677840493325257,
+      "learning_rate": 2.6815229998718492e-05,
+      "loss": 0.6364,
+      "step": 2882
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.3397310545189047,
+      "learning_rate": 2.6756392981582835e-05,
+      "loss": 0.6255,
+      "step": 2883
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.3542887275124184,
+      "learning_rate": 2.669761061352527e-05,
+      "loss": 0.6087,
+      "step": 2884
+    },
+    {
+      "epoch": 0.7693333333333333,
+      "grad_norm": 0.34850317816204396,
+      "learning_rate": 2.6638882938404964e-05,
+      "loss": 0.5781,
+      "step": 2885
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.32586211658065056,
+      "learning_rate": 2.6580210000040252e-05,
+      "loss": 0.5624,
+      "step": 2886
+    },
+    {
+      "epoch": 0.7698666666666667,
+      "grad_norm": 0.3262727265793471,
+      "learning_rate": 2.6521591842208636e-05,
+      "loss": 0.6027,
+      "step": 2887
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.3805206490097053,
+      "learning_rate": 2.646302850864677e-05,
+      "loss": 0.5928,
+      "step": 2888
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.4774418088688516,
+      "learning_rate": 2.6404520043050316e-05,
+      "loss": 0.5621,
+      "step": 2889
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.3590649841143849,
+      "learning_rate": 2.6346066489074085e-05,
+      "loss": 0.6163,
+      "step": 2890
+    },
+    {
+      "epoch": 0.7709333333333334,
+      "grad_norm": 0.34624724191296896,
+      "learning_rate": 2.628766789033188e-05,
+      "loss": 0.592,
+      "step": 2891
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.35398341996013827,
+      "learning_rate": 2.6229324290396517e-05,
+      "loss": 0.5896,
+      "step": 2892
+    },
+    {
+      "epoch": 0.7714666666666666,
+      "grad_norm": 0.34491124969305736,
+      "learning_rate": 2.6171035732799766e-05,
+      "loss": 0.5893,
+      "step": 2893
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.3273373885550423,
+      "learning_rate": 2.6112802261032333e-05,
+      "loss": 0.5547,
+      "step": 2894
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.35819953877359534,
+      "learning_rate": 2.6054623918543818e-05,
+      "loss": 0.5951,
+      "step": 2895
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.3593055667000119,
+      "learning_rate": 2.5996500748742693e-05,
+      "loss": 0.5758,
+      "step": 2896
+    },
+    {
+      "epoch": 0.7725333333333333,
+      "grad_norm": 0.3500869697127036,
+      "learning_rate": 2.5938432794996247e-05,
+      "loss": 0.5674,
+      "step": 2897
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.3337593031098043,
+      "learning_rate": 2.58804201006306e-05,
+      "loss": 0.5221,
+      "step": 2898
+    },
+    {
+      "epoch": 0.7730666666666667,
+      "grad_norm": 0.31726377157544783,
+      "learning_rate": 2.5822462708930607e-05,
+      "loss": 0.5316,
+      "step": 2899
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.38309907455470354,
+      "learning_rate": 2.5764560663139893e-05,
+      "loss": 0.6158,
+      "step": 2900
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.34890471898909503,
+      "learning_rate": 2.5706714006460775e-05,
+      "loss": 0.5539,
+      "step": 2901
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.3217838755652875,
+      "learning_rate": 2.564892278205423e-05,
+      "loss": 0.5345,
+      "step": 2902
+    },
+    {
+      "epoch": 0.7741333333333333,
+      "grad_norm": 0.36001095560304475,
+      "learning_rate": 2.55911870330399e-05,
+      "loss": 0.5807,
+      "step": 2903
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3198848270430251,
+      "learning_rate": 2.553350680249603e-05,
+      "loss": 0.5438,
+      "step": 2904
+    },
+    {
+      "epoch": 0.7746666666666666,
+      "grad_norm": 0.34487127863747363,
+      "learning_rate": 2.5475882133459404e-05,
+      "loss": 0.5775,
+      "step": 2905
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.3411786328623317,
+      "learning_rate": 2.541831306892538e-05,
+      "loss": 0.5427,
+      "step": 2906
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.3375887356805181,
+      "learning_rate": 2.5360799651847855e-05,
+      "loss": 0.5573,
+      "step": 2907
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.33238618803649345,
+      "learning_rate": 2.5303341925139157e-05,
+      "loss": 0.5826,
+      "step": 2908
+    },
+    {
+      "epoch": 0.7757333333333334,
+      "grad_norm": 0.35801528194447835,
+      "learning_rate": 2.524593993167008e-05,
+      "loss": 0.5746,
+      "step": 2909
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3280498033236464,
+      "learning_rate": 2.518859371426985e-05,
+      "loss": 0.5456,
+      "step": 2910
+    },
+    {
+      "epoch": 0.7762666666666667,
+      "grad_norm": 0.3814129159093021,
+      "learning_rate": 2.5131303315726096e-05,
+      "loss": 0.577,
+      "step": 2911
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.3665650846255579,
+      "learning_rate": 2.5074068778784687e-05,
+      "loss": 0.5574,
+      "step": 2912
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.3282645759624906,
+      "learning_rate": 2.501689014614995e-05,
+      "loss": 0.5534,
+      "step": 2913
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.3658811470179827,
+      "learning_rate": 2.4959767460484384e-05,
+      "loss": 0.6137,
+      "step": 2914
+    },
+    {
+      "epoch": 0.7773333333333333,
+      "grad_norm": 0.3623977835828062,
+      "learning_rate": 2.4902700764408883e-05,
+      "loss": 0.587,
+      "step": 2915
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3772563853133515,
+      "learning_rate": 2.484569010050244e-05,
+      "loss": 0.609,
+      "step": 2916
+    },
+    {
+      "epoch": 0.7778666666666667,
+      "grad_norm": 0.3392553477959139,
+      "learning_rate": 2.4788735511302295e-05,
+      "loss": 0.5657,
+      "step": 2917
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.3618047663803316,
+      "learning_rate": 2.473183703930384e-05,
+      "loss": 0.5901,
+      "step": 2918
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.35053861270716696,
+      "learning_rate": 2.4674994726960633e-05,
+      "loss": 0.5734,
+      "step": 2919
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.3594647809109259,
+      "learning_rate": 2.4618208616684214e-05,
+      "loss": 0.6153,
+      "step": 2920
+    },
+    {
+      "epoch": 0.7789333333333334,
+      "grad_norm": 0.32647511916596544,
+      "learning_rate": 2.45614787508443e-05,
+      "loss": 0.5601,
+      "step": 2921
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.334028580235852,
+      "learning_rate": 2.4504805171768642e-05,
+      "loss": 0.5409,
+      "step": 2922
+    },
+    {
+      "epoch": 0.7794666666666666,
+      "grad_norm": 0.38378756268180986,
+      "learning_rate": 2.4448187921742925e-05,
+      "loss": 0.5765,
+      "step": 2923
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.32324283794676945,
+      "learning_rate": 2.4391627043010855e-05,
+      "loss": 0.5333,
+      "step": 2924
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.34944296392694285,
+      "learning_rate": 2.4335122577774072e-05,
+      "loss": 0.5413,
+      "step": 2925
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.3508850592798124,
+      "learning_rate": 2.4278674568192128e-05,
+      "loss": 0.5625,
+      "step": 2926
+    },
+    {
+      "epoch": 0.7805333333333333,
+      "grad_norm": 0.3341152152889283,
+      "learning_rate": 2.4222283056382444e-05,
+      "loss": 0.607,
+      "step": 2927
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.33756049390002807,
+      "learning_rate": 2.4165948084420243e-05,
+      "loss": 0.5327,
+      "step": 2928
+    },
+    {
+      "epoch": 0.7810666666666667,
+      "grad_norm": 0.312321732118843,
+      "learning_rate": 2.4109669694338632e-05,
+      "loss": 0.5466,
+      "step": 2929
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.3975393449590267,
+      "learning_rate": 2.405344792812847e-05,
+      "loss": 0.5452,
+      "step": 2930
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.3265068424958377,
+      "learning_rate": 2.3997282827738366e-05,
+      "loss": 0.5504,
+      "step": 2931
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.3452517475991024,
+      "learning_rate": 2.3941174435074654e-05,
+      "loss": 0.56,
+      "step": 2932
+    },
+    {
+      "epoch": 0.7821333333333333,
+      "grad_norm": 0.33782500403876303,
+      "learning_rate": 2.388512279200137e-05,
+      "loss": 0.6001,
+      "step": 2933
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3370854004151688,
+      "learning_rate": 2.3829127940340168e-05,
+      "loss": 0.5563,
+      "step": 2934
+    },
+    {
+      "epoch": 0.7826666666666666,
+      "grad_norm": 0.33451810783263564,
+      "learning_rate": 2.3773189921870376e-05,
+      "loss": 0.5826,
+      "step": 2935
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.333585780842901,
+      "learning_rate": 2.3717308778328874e-05,
+      "loss": 0.5488,
+      "step": 2936
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.37089414369526985,
+      "learning_rate": 2.366148455141014e-05,
+      "loss": 0.593,
+      "step": 2937
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.37176979525298176,
+      "learning_rate": 2.360571728276617e-05,
+      "loss": 0.5659,
+      "step": 2938
+    },
+    {
+      "epoch": 0.7837333333333333,
+      "grad_norm": 0.34479276230369466,
+      "learning_rate": 2.355000701400647e-05,
+      "loss": 0.5139,
+      "step": 2939
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3251565084630508,
+      "learning_rate": 2.3494353786698e-05,
+      "loss": 0.5794,
+      "step": 2940
+    },
+    {
+      "epoch": 0.7842666666666667,
+      "grad_norm": 0.34830553401586795,
+      "learning_rate": 2.343875764236516e-05,
+      "loss": 0.6183,
+      "step": 2941
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.34745467283869724,
+      "learning_rate": 2.3383218622489787e-05,
+      "loss": 0.5944,
+      "step": 2942
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3358164507661162,
+      "learning_rate": 2.3327736768511098e-05,
+      "loss": 0.6149,
+      "step": 2943
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.33637899727272086,
+      "learning_rate": 2.327231212182559e-05,
+      "loss": 0.5858,
+      "step": 2944
+    },
+    {
+      "epoch": 0.7853333333333333,
+      "grad_norm": 0.3821470768425901,
+      "learning_rate": 2.3216944723787138e-05,
+      "loss": 0.6128,
+      "step": 2945
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3497881291500303,
+      "learning_rate": 2.3161634615706896e-05,
+      "loss": 0.5769,
+      "step": 2946
+    },
+    {
+      "epoch": 0.7858666666666667,
+      "grad_norm": 0.34137102419980003,
+      "learning_rate": 2.3106381838853253e-05,
+      "loss": 0.5735,
+      "step": 2947
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.34202923118737844,
+      "learning_rate": 2.3051186434451834e-05,
+      "loss": 0.5866,
+      "step": 2948
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.3629232532167221,
+      "learning_rate": 2.299604844368547e-05,
+      "loss": 0.6433,
+      "step": 2949
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.3545712525574004,
+      "learning_rate": 2.2940967907694112e-05,
+      "loss": 0.5332,
+      "step": 2950
+    },
+    {
+      "epoch": 0.7869333333333334,
+      "grad_norm": 0.3579347732370336,
+      "learning_rate": 2.2885944867574893e-05,
+      "loss": 0.5836,
+      "step": 2951
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3566100557157306,
+      "learning_rate": 2.2830979364382022e-05,
+      "loss": 0.6296,
+      "step": 2952
+    },
+    {
+      "epoch": 0.7874666666666666,
+      "grad_norm": 0.34699427364241003,
+      "learning_rate": 2.2776071439126757e-05,
+      "loss": 0.6025,
+      "step": 2953
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.3465231209142456,
+      "learning_rate": 2.272122113277744e-05,
+      "loss": 0.594,
+      "step": 2954
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.3670368858269258,
+      "learning_rate": 2.2666428486259382e-05,
+      "loss": 0.5329,
+      "step": 2955
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.32928263393429363,
+      "learning_rate": 2.2611693540454915e-05,
+      "loss": 0.5269,
+      "step": 2956
+    },
+    {
+      "epoch": 0.7885333333333333,
+      "grad_norm": 0.35814072641520267,
+      "learning_rate": 2.2557016336203262e-05,
+      "loss": 0.5712,
+      "step": 2957
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3331818850374455,
+      "learning_rate": 2.250239691430065e-05,
+      "loss": 0.5664,
+      "step": 2958
+    },
+    {
+      "epoch": 0.7890666666666667,
+      "grad_norm": 0.4036037057308087,
+      "learning_rate": 2.2447835315500065e-05,
+      "loss": 0.5958,
+      "step": 2959
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3449568080902489,
+      "learning_rate": 2.239333158051147e-05,
+      "loss": 0.5638,
+      "step": 2960
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.3496415127045761,
+      "learning_rate": 2.2338885750001582e-05,
+      "loss": 0.6358,
+      "step": 2961
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3501481408186804,
+      "learning_rate": 2.2284497864593944e-05,
+      "loss": 0.5972,
+      "step": 2962
+    },
+    {
+      "epoch": 0.7901333333333334,
+      "grad_norm": 0.33463969993135484,
+      "learning_rate": 2.2230167964868877e-05,
+      "loss": 0.5849,
+      "step": 2963
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.34851629426992603,
+      "learning_rate": 2.2175896091363414e-05,
+      "loss": 0.5715,
+      "step": 2964
+    },
+    {
+      "epoch": 0.7906666666666666,
+      "grad_norm": 0.36633196278627644,
+      "learning_rate": 2.212168228457129e-05,
+      "loss": 0.5733,
+      "step": 2965
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.35028472232384256,
+      "learning_rate": 2.2067526584942945e-05,
+      "loss": 0.6005,
+      "step": 2966
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.33031300973996097,
+      "learning_rate": 2.201342903288541e-05,
+      "loss": 0.5594,
+      "step": 2967
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.3601428153644341,
+      "learning_rate": 2.1959389668762377e-05,
+      "loss": 0.6124,
+      "step": 2968
+    },
+    {
+      "epoch": 0.7917333333333333,
+      "grad_norm": 0.3711474349441613,
+      "learning_rate": 2.19054085328941e-05,
+      "loss": 0.6243,
+      "step": 2969
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3414346088394343,
+      "learning_rate": 2.185148566555738e-05,
+      "loss": 0.5813,
+      "step": 2970
+    },
+    {
+      "epoch": 0.7922666666666667,
+      "grad_norm": 0.33354638034797607,
+      "learning_rate": 2.179762110698561e-05,
+      "loss": 0.5896,
+      "step": 2971
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.36501966154817117,
+      "learning_rate": 2.1743814897368597e-05,
+      "loss": 0.6131,
+      "step": 2972
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.3581623925022744,
+      "learning_rate": 2.1690067076852638e-05,
+      "loss": 0.5856,
+      "step": 2973
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.39654581731492705,
+      "learning_rate": 2.1636377685540487e-05,
+      "loss": 0.6308,
+      "step": 2974
+    },
+    {
+      "epoch": 0.7933333333333333,
+      "grad_norm": 0.3436151254193697,
+      "learning_rate": 2.1582746763491245e-05,
+      "loss": 0.5673,
+      "step": 2975
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.35044512361342534,
+      "learning_rate": 2.152917435072044e-05,
+      "loss": 0.574,
+      "step": 2976
+    },
+    {
+      "epoch": 0.7938666666666667,
+      "grad_norm": 0.3391148517034316,
+      "learning_rate": 2.1475660487199933e-05,
+      "loss": 0.5721,
+      "step": 2977
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.3462516980294206,
+      "learning_rate": 2.1422205212857892e-05,
+      "loss": 0.5762,
+      "step": 2978
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.32617739058869694,
+      "learning_rate": 2.136880856757877e-05,
+      "loss": 0.5532,
+      "step": 2979
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.3353370888440073,
+      "learning_rate": 2.131547059120329e-05,
+      "loss": 0.5525,
+      "step": 2980
+    },
+    {
+      "epoch": 0.7949333333333334,
+      "grad_norm": 0.35423787000879425,
+      "learning_rate": 2.1262191323528368e-05,
+      "loss": 0.5734,
+      "step": 2981
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.35902716894319314,
+      "learning_rate": 2.1208970804307194e-05,
+      "loss": 0.5975,
+      "step": 2982
+    },
+    {
+      "epoch": 0.7954666666666667,
+      "grad_norm": 0.32231984985583706,
+      "learning_rate": 2.115580907324899e-05,
+      "loss": 0.577,
+      "step": 2983
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.36113582567234853,
+      "learning_rate": 2.110270617001924e-05,
+      "loss": 0.6316,
+      "step": 2984
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.3800356684757358,
+      "learning_rate": 2.1049662134239457e-05,
+      "loss": 0.5898,
+      "step": 2985
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.3841705838592886,
+      "learning_rate": 2.0996677005487285e-05,
+      "loss": 0.5688,
+      "step": 2986
+    },
+    {
+      "epoch": 0.7965333333333333,
+      "grad_norm": 0.3372205092230364,
+      "learning_rate": 2.094375082329638e-05,
+      "loss": 0.6191,
+      "step": 2987
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3410194847506707,
+      "learning_rate": 2.0890883627156442e-05,
+      "loss": 0.502,
+      "step": 2988
+    },
+    {
+      "epoch": 0.7970666666666667,
+      "grad_norm": 0.37071420339438005,
+      "learning_rate": 2.0838075456513128e-05,
+      "loss": 0.5506,
+      "step": 2989
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.32982977459433915,
+      "learning_rate": 2.0785326350768087e-05,
+      "loss": 0.5405,
+      "step": 2990
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.3481684852598814,
+      "learning_rate": 2.0732636349278878e-05,
+      "loss": 0.5954,
+      "step": 2991
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.3285778523524745,
+      "learning_rate": 2.0680005491358967e-05,
+      "loss": 0.5085,
+      "step": 2992
+    },
+    {
+      "epoch": 0.7981333333333334,
+      "grad_norm": 0.3305428040598721,
+      "learning_rate": 2.0627433816277684e-05,
+      "loss": 0.5469,
+      "step": 2993
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3524810485184063,
+      "learning_rate": 2.0574921363260226e-05,
+      "loss": 0.5656,
+      "step": 2994
+    },
+    {
+      "epoch": 0.7986666666666666,
+      "grad_norm": 0.3304612675564754,
+      "learning_rate": 2.0522468171487564e-05,
+      "loss": 0.5605,
+      "step": 2995
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.33560023661076915,
+      "learning_rate": 2.0470074280096484e-05,
+      "loss": 0.5665,
+      "step": 2996
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.3420571509222942,
+      "learning_rate": 2.041773972817954e-05,
+      "loss": 0.5548,
+      "step": 2997
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.36316095570694745,
+      "learning_rate": 2.0365464554784942e-05,
+      "loss": 0.5229,
+      "step": 2998
+    },
+    {
+      "epoch": 0.7997333333333333,
+      "grad_norm": 0.3831509790868416,
+      "learning_rate": 2.031324879891664e-05,
+      "loss": 0.6059,
+      "step": 2999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3468139295168862,
+      "learning_rate": 2.0261092499534285e-05,
+      "loss": 0.585,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8002666666666667,
+      "grad_norm": 0.3283080311199373,
+      "learning_rate": 2.020899569555311e-05,
+      "loss": 0.5392,
+      "step": 3001
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.34129602356471256,
+      "learning_rate": 2.0156958425843987e-05,
+      "loss": 0.5572,
+      "step": 3002
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.338405424984137,
+      "learning_rate": 2.0104980729233368e-05,
+      "loss": 0.6059,
+      "step": 3003
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.3543978323110512,
+      "learning_rate": 2.0053062644503228e-05,
+      "loss": 0.5315,
+      "step": 3004
+    },
+    {
+      "epoch": 0.8013333333333333,
+      "grad_norm": 0.3596869803007064,
+      "learning_rate": 2.000120421039111e-05,
+      "loss": 0.6161,
+      "step": 3005
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3310055304869842,
+      "learning_rate": 1.994940546559e-05,
+      "loss": 0.527,
+      "step": 3006
+    },
+    {
+      "epoch": 0.8018666666666666,
+      "grad_norm": 0.3679947211080711,
+      "learning_rate": 1.9897666448748387e-05,
+      "loss": 0.5777,
+      "step": 3007
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.3365881418478606,
+      "learning_rate": 1.9845987198470174e-05,
+      "loss": 0.5343,
+      "step": 3008
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3641471750427538,
+      "learning_rate": 1.979436775331468e-05,
+      "loss": 0.566,
+      "step": 3009
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.35879713886753284,
+      "learning_rate": 1.9742808151796587e-05,
+      "loss": 0.6098,
+      "step": 3010
+    },
+    {
+      "epoch": 0.8029333333333334,
+      "grad_norm": 0.3450226540112181,
+      "learning_rate": 1.9691308432385956e-05,
+      "loss": 0.5947,
+      "step": 3011
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3983602084144142,
+      "learning_rate": 1.963986863350814e-05,
+      "loss": 0.5772,
+      "step": 3012
+    },
+    {
+      "epoch": 0.8034666666666667,
+      "grad_norm": 0.3811759945389708,
+      "learning_rate": 1.9588488793543824e-05,
+      "loss": 0.5106,
+      "step": 3013
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.3206806892853088,
+      "learning_rate": 1.9537168950828875e-05,
+      "loss": 0.5384,
+      "step": 3014
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.3497959505990501,
+      "learning_rate": 1.9485909143654457e-05,
+      "loss": 0.5166,
+      "step": 3015
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.3437040874983381,
+      "learning_rate": 1.943470941026695e-05,
+      "loss": 0.5812,
+      "step": 3016
+    },
+    {
+      "epoch": 0.8045333333333333,
+      "grad_norm": 0.36824913600742964,
+      "learning_rate": 1.9383569788867873e-05,
+      "loss": 0.5694,
+      "step": 3017
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3623494389131158,
+      "learning_rate": 1.9332490317613904e-05,
+      "loss": 0.568,
+      "step": 3018
+    },
+    {
+      "epoch": 0.8050666666666667,
+      "grad_norm": 0.3424569572169921,
+      "learning_rate": 1.928147103461687e-05,
+      "loss": 0.565,
+      "step": 3019
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.3455387317412677,
+      "learning_rate": 1.9230511977943643e-05,
+      "loss": 0.5987,
+      "step": 3020
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.32797040924539606,
+      "learning_rate": 1.917961318561623e-05,
+      "loss": 0.5476,
+      "step": 3021
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.347167277213583,
+      "learning_rate": 1.9128774695611562e-05,
+      "loss": 0.5688,
+      "step": 3022
+    },
+    {
+      "epoch": 0.8061333333333334,
+      "grad_norm": 0.3498274783989771,
+      "learning_rate": 1.9077996545861677e-05,
+      "loss": 0.5418,
+      "step": 3023
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3325721718081461,
+      "learning_rate": 1.902727877425353e-05,
+      "loss": 0.5323,
+      "step": 3024
+    },
+    {
+      "epoch": 0.8066666666666666,
+      "grad_norm": 0.3582061127968383,
+      "learning_rate": 1.8976621418629047e-05,
+      "loss": 0.6011,
+      "step": 3025
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.35025751466836996,
+      "learning_rate": 1.8926024516785135e-05,
+      "loss": 0.564,
+      "step": 3026
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.3263800107252902,
+      "learning_rate": 1.8875488106473495e-05,
+      "loss": 0.5499,
+      "step": 3027
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.33237138627982893,
+      "learning_rate": 1.8825012225400752e-05,
+      "loss": 0.5275,
+      "step": 3028
+    },
+    {
+      "epoch": 0.8077333333333333,
+      "grad_norm": 0.3722351293796434,
+      "learning_rate": 1.8774596911228382e-05,
+      "loss": 0.5893,
+      "step": 3029
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4122538209596308,
+      "learning_rate": 1.8724242201572585e-05,
+      "loss": 0.5907,
+      "step": 3030
+    },
+    {
+      "epoch": 0.8082666666666667,
+      "grad_norm": 0.358682437899776,
+      "learning_rate": 1.8673948134004426e-05,
+      "loss": 0.5877,
+      "step": 3031
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.34031307693105944,
+      "learning_rate": 1.8623714746049704e-05,
+      "loss": 0.5761,
+      "step": 3032
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.3246738900676092,
+      "learning_rate": 1.8573542075188932e-05,
+      "loss": 0.5657,
+      "step": 3033
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.3433072148820531,
+      "learning_rate": 1.8523430158857334e-05,
+      "loss": 0.5646,
+      "step": 3034
+    },
+    {
+      "epoch": 0.8093333333333333,
+      "grad_norm": 0.3387495616516081,
+      "learning_rate": 1.8473379034444782e-05,
+      "loss": 0.5758,
+      "step": 3035
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.34529923052733885,
+      "learning_rate": 1.8423388739295833e-05,
+      "loss": 0.588,
+      "step": 3036
+    },
+    {
+      "epoch": 0.8098666666666666,
+      "grad_norm": 0.3466341485690237,
+      "learning_rate": 1.8373459310709585e-05,
+      "loss": 0.574,
+      "step": 3037
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.34573166808366856,
+      "learning_rate": 1.832359078593977e-05,
+      "loss": 0.5896,
+      "step": 3038
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.348255522853395,
+      "learning_rate": 1.8273783202194694e-05,
+      "loss": 0.5553,
+      "step": 3039
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.33829183891849923,
+      "learning_rate": 1.8224036596637152e-05,
+      "loss": 0.5427,
+      "step": 3040
+    },
+    {
+      "epoch": 0.8109333333333333,
+      "grad_norm": 0.3417128184368856,
+      "learning_rate": 1.8174351006384473e-05,
+      "loss": 0.5617,
+      "step": 3041
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3597667027802103,
+      "learning_rate": 1.8124726468508435e-05,
+      "loss": 0.5516,
+      "step": 3042
+    },
+    {
+      "epoch": 0.8114666666666667,
+      "grad_norm": 0.31211275175926995,
+      "learning_rate": 1.8075163020035292e-05,
+      "loss": 0.5016,
+      "step": 3043
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.3674318252568378,
+      "learning_rate": 1.802566069794569e-05,
+      "loss": 0.5979,
+      "step": 3044
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.36490718474560413,
+      "learning_rate": 1.7976219539174687e-05,
+      "loss": 0.5629,
+      "step": 3045
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.3482359713706896,
+      "learning_rate": 1.79268395806117e-05,
+      "loss": 0.565,
+      "step": 3046
+    },
+    {
+      "epoch": 0.8125333333333333,
+      "grad_norm": 0.355305787392465,
+      "learning_rate": 1.787752085910046e-05,
+      "loss": 0.6048,
+      "step": 3047
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.37025822305093725,
+      "learning_rate": 1.782826341143904e-05,
+      "loss": 0.597,
+      "step": 3048
+    },
+    {
+      "epoch": 0.8130666666666667,
+      "grad_norm": 0.349070178251286,
+      "learning_rate": 1.777906727437979e-05,
+      "loss": 0.5988,
+      "step": 3049
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.3271595322401396,
+      "learning_rate": 1.7729932484629296e-05,
+      "loss": 0.5395,
+      "step": 3050
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.3321691289751727,
+      "learning_rate": 1.7680859078848376e-05,
+      "loss": 0.555,
+      "step": 3051
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.354746522010819,
+      "learning_rate": 1.7631847093652098e-05,
+      "loss": 0.6291,
+      "step": 3052
+    },
+    {
+      "epoch": 0.8141333333333334,
+      "grad_norm": 0.4154338098869775,
+      "learning_rate": 1.7582896565609598e-05,
+      "loss": 0.5762,
+      "step": 3053
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.343550579976311,
+      "learning_rate": 1.7534007531244236e-05,
+      "loss": 0.5542,
+      "step": 3054
+    },
+    {
+      "epoch": 0.8146666666666667,
+      "grad_norm": 0.34442071345670167,
+      "learning_rate": 1.7485180027033475e-05,
+      "loss": 0.5822,
+      "step": 3055
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.3454538987912449,
+      "learning_rate": 1.743641408940886e-05,
+      "loss": 0.6414,
+      "step": 3056
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.35737450665453296,
+      "learning_rate": 1.738770975475602e-05,
+      "loss": 0.5582,
+      "step": 3057
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.3406958021802569,
+      "learning_rate": 1.7339067059414582e-05,
+      "loss": 0.5394,
+      "step": 3058
+    },
+    {
+      "epoch": 0.8157333333333333,
+      "grad_norm": 0.3540868724474538,
+      "learning_rate": 1.7290486039678223e-05,
+      "loss": 0.5779,
+      "step": 3059
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3356834500889169,
+      "learning_rate": 1.7241966731794578e-05,
+      "loss": 0.5499,
+      "step": 3060
+    },
+    {
+      "epoch": 0.8162666666666667,
+      "grad_norm": 0.34861717312040424,
+      "learning_rate": 1.7193509171965237e-05,
+      "loss": 0.5576,
+      "step": 3061
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.31299222278930233,
+      "learning_rate": 1.7145113396345725e-05,
+      "loss": 0.5303,
+      "step": 3062
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.3464489477861352,
+      "learning_rate": 1.7096779441045473e-05,
+      "loss": 0.5702,
+      "step": 3063
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.34110702062407466,
+      "learning_rate": 1.7048507342127785e-05,
+      "loss": 0.5945,
+      "step": 3064
+    },
+    {
+      "epoch": 0.8173333333333334,
+      "grad_norm": 0.33369153933547685,
+      "learning_rate": 1.7000297135609787e-05,
+      "loss": 0.5718,
+      "step": 3065
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3264318555296828,
+      "learning_rate": 1.695214885746246e-05,
+      "loss": 0.5202,
+      "step": 3066
+    },
+    {
+      "epoch": 0.8178666666666666,
+      "grad_norm": 0.3606842064423147,
+      "learning_rate": 1.6904062543610556e-05,
+      "loss": 0.5559,
+      "step": 3067
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.33856709350332637,
+      "learning_rate": 1.6856038229932636e-05,
+      "loss": 0.6019,
+      "step": 3068
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.33504660943360826,
+      "learning_rate": 1.6808075952260915e-05,
+      "loss": 0.5454,
+      "step": 3069
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.3286890067049924,
+      "learning_rate": 1.6760175746381402e-05,
+      "loss": 0.5569,
+      "step": 3070
+    },
+    {
+      "epoch": 0.8189333333333333,
+      "grad_norm": 0.35798914123973774,
+      "learning_rate": 1.6712337648033748e-05,
+      "loss": 0.585,
+      "step": 3071
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.33909398079926073,
+      "learning_rate": 1.6664561692911284e-05,
+      "loss": 0.516,
+      "step": 3072
+    },
+    {
+      "epoch": 0.8194666666666667,
+      "grad_norm": 0.3385164586449421,
+      "learning_rate": 1.6616847916660992e-05,
+      "loss": 0.5987,
+      "step": 3073
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.3308690950640168,
+      "learning_rate": 1.656919635488341e-05,
+      "loss": 0.5328,
+      "step": 3074
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.35880878727539284,
+      "learning_rate": 1.6521607043132714e-05,
+      "loss": 0.5534,
+      "step": 3075
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.34203552628716377,
+      "learning_rate": 1.647408001691657e-05,
+      "loss": 0.5585,
+      "step": 3076
+    },
+    {
+      "epoch": 0.8205333333333333,
+      "grad_norm": 0.34149842048265244,
+      "learning_rate": 1.6426615311696226e-05,
+      "loss": 0.5883,
+      "step": 3077
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3280354312415396,
+      "learning_rate": 1.6379212962886394e-05,
+      "loss": 0.5225,
+      "step": 3078
+    },
+    {
+      "epoch": 0.8210666666666666,
+      "grad_norm": 0.34276041276084807,
+      "learning_rate": 1.633187300585528e-05,
+      "loss": 0.5613,
+      "step": 3079
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.37355750210778077,
+      "learning_rate": 1.6284595475924546e-05,
+      "loss": 0.5901,
+      "step": 3080
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.3451587348565714,
+      "learning_rate": 1.623738040836923e-05,
+      "loss": 0.5749,
+      "step": 3081
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.3430810898241322,
+      "learning_rate": 1.619022783841785e-05,
+      "loss": 0.5427,
+      "step": 3082
+    },
+    {
+      "epoch": 0.8221333333333334,
+      "grad_norm": 0.317669618085047,
+      "learning_rate": 1.614313780125224e-05,
+      "loss": 0.5284,
+      "step": 3083
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.35776214539323364,
+      "learning_rate": 1.609611033200752e-05,
+      "loss": 0.5722,
+      "step": 3084
+    },
+    {
+      "epoch": 0.8226666666666667,
+      "grad_norm": 0.33701294207980914,
+      "learning_rate": 1.6049145465772218e-05,
+      "loss": 0.5844,
+      "step": 3085
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.37266954199545144,
+      "learning_rate": 1.6002243237588112e-05,
+      "loss": 0.6142,
+      "step": 3086
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.36180311121312675,
+      "learning_rate": 1.5955403682450252e-05,
+      "loss": 0.6758,
+      "step": 3087
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.3291685064163525,
+      "learning_rate": 1.5908626835306938e-05,
+      "loss": 0.567,
+      "step": 3088
+    },
+    {
+      "epoch": 0.8237333333333333,
+      "grad_norm": 0.37160550945805754,
+      "learning_rate": 1.5861912731059636e-05,
+      "loss": 0.5546,
+      "step": 3089
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3482485423047536,
+      "learning_rate": 1.5815261404563065e-05,
+      "loss": 0.5991,
+      "step": 3090
+    },
+    {
+      "epoch": 0.8242666666666667,
+      "grad_norm": 0.36364254800404955,
+      "learning_rate": 1.5768672890625058e-05,
+      "loss": 0.5516,
+      "step": 3091
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.32491513929243804,
+      "learning_rate": 1.5722147224006565e-05,
+      "loss": 0.5893,
+      "step": 3092
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.35602654794975985,
+      "learning_rate": 1.5675684439421702e-05,
+      "loss": 0.5694,
+      "step": 3093
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.4167272938154404,
+      "learning_rate": 1.5629284571537618e-05,
+      "loss": 0.5651,
+      "step": 3094
+    },
+    {
+      "epoch": 0.8253333333333334,
+      "grad_norm": 0.34858193964545053,
+      "learning_rate": 1.5582947654974533e-05,
+      "loss": 0.5524,
+      "step": 3095
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3468123358100999,
+      "learning_rate": 1.5536673724305716e-05,
+      "loss": 0.563,
+      "step": 3096
+    },
+    {
+      "epoch": 0.8258666666666666,
+      "grad_norm": 0.34670637513527003,
+      "learning_rate": 1.5490462814057415e-05,
+      "loss": 0.5575,
+      "step": 3097
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.3184868533646417,
+      "learning_rate": 1.5444314958708873e-05,
+      "loss": 0.5443,
+      "step": 3098
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.32110216001052383,
+      "learning_rate": 1.5398230192692277e-05,
+      "loss": 0.5201,
+      "step": 3099
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.3462876778150794,
+      "learning_rate": 1.5352208550392743e-05,
+      "loss": 0.5931,
+      "step": 3100
+    },
+    {
+      "epoch": 0.8269333333333333,
+      "grad_norm": 0.3312189465926998,
+      "learning_rate": 1.5306250066148285e-05,
+      "loss": 0.5589,
+      "step": 3101
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.32952532243402916,
+      "learning_rate": 1.5260354774249806e-05,
+      "loss": 0.5665,
+      "step": 3102
+    },
+    {
+      "epoch": 0.8274666666666667,
+      "grad_norm": 0.3593564091826446,
+      "learning_rate": 1.5214522708941037e-05,
+      "loss": 0.5549,
+      "step": 3103
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.3609291592066639,
+      "learning_rate": 1.5168753904418565e-05,
+      "loss": 0.6086,
+      "step": 3104
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.342609030376073,
+      "learning_rate": 1.512304839483175e-05,
+      "loss": 0.5591,
+      "step": 3105
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.34918690776952505,
+      "learning_rate": 1.5077406214282741e-05,
+      "loss": 0.6121,
+      "step": 3106
+    },
+    {
+      "epoch": 0.8285333333333333,
+      "grad_norm": 0.36463903701050193,
+      "learning_rate": 1.5031827396826448e-05,
+      "loss": 0.5781,
+      "step": 3107
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.36561929772298407,
+      "learning_rate": 1.4986311976470425e-05,
+      "loss": 0.5783,
+      "step": 3108
+    },
+    {
+      "epoch": 0.8290666666666666,
+      "grad_norm": 0.33928718604895186,
+      "learning_rate": 1.4940859987175037e-05,
+      "loss": 0.56,
+      "step": 3109
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.34876984697828656,
+      "learning_rate": 1.489547146285325e-05,
+      "loss": 0.544,
+      "step": 3110
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.32426992058441134,
+      "learning_rate": 1.4850146437370693e-05,
+      "loss": 0.5392,
+      "step": 3111
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.3311752958283469,
+      "learning_rate": 1.4804884944545627e-05,
+      "loss": 0.5549,
+      "step": 3112
+    },
+    {
+      "epoch": 0.8301333333333333,
+      "grad_norm": 0.33526699517919134,
+      "learning_rate": 1.4759687018148894e-05,
+      "loss": 0.5834,
+      "step": 3113
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.34289504616230815,
+      "learning_rate": 1.471455269190396e-05,
+      "loss": 0.5532,
+      "step": 3114
+    },
+    {
+      "epoch": 0.8306666666666667,
+      "grad_norm": 0.3554178233628037,
+      "learning_rate": 1.466948199948669e-05,
+      "loss": 0.4929,
+      "step": 3115
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.35488489073814056,
+      "learning_rate": 1.462447497452567e-05,
+      "loss": 0.5835,
+      "step": 3116
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.3388100258171893,
+      "learning_rate": 1.4579531650601853e-05,
+      "loss": 0.5861,
+      "step": 3117
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.3252292502494152,
+      "learning_rate": 1.4534652061248677e-05,
+      "loss": 0.5506,
+      "step": 3118
+    },
+    {
+      "epoch": 0.8317333333333333,
+      "grad_norm": 0.332497505901698,
+      "learning_rate": 1.448983623995207e-05,
+      "loss": 0.5403,
+      "step": 3119
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3538993481602382,
+      "learning_rate": 1.444508422015034e-05,
+      "loss": 0.5781,
+      "step": 3120
+    },
+    {
+      "epoch": 0.8322666666666667,
+      "grad_norm": 0.3330130248940368,
+      "learning_rate": 1.4400396035234198e-05,
+      "loss": 0.5494,
+      "step": 3121
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.35722297271831394,
+      "learning_rate": 1.4355771718546773e-05,
+      "loss": 0.53,
+      "step": 3122
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.33146304474566984,
+      "learning_rate": 1.4311211303383442e-05,
+      "loss": 0.5723,
+      "step": 3123
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.34921100976654335,
+      "learning_rate": 1.4266714822991989e-05,
+      "loss": 0.5399,
+      "step": 3124
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.34187059882469273,
+      "learning_rate": 1.4222282310572465e-05,
+      "loss": 0.5828,
+      "step": 3125
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3508506691671756,
+      "learning_rate": 1.4177913799277198e-05,
+      "loss": 0.5611,
+      "step": 3126
+    },
+    {
+      "epoch": 0.8338666666666666,
+      "grad_norm": 0.34778415638058724,
+      "learning_rate": 1.4133609322210762e-05,
+      "loss": 0.4985,
+      "step": 3127
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.3249220339834162,
+      "learning_rate": 1.4089368912429945e-05,
+      "loss": 0.5186,
+      "step": 3128
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.35513881664832403,
+      "learning_rate": 1.4045192602943736e-05,
+      "loss": 0.5931,
+      "step": 3129
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.32643230736436546,
+      "learning_rate": 1.4001080426713331e-05,
+      "loss": 0.5443,
+      "step": 3130
+    },
+    {
+      "epoch": 0.8349333333333333,
+      "grad_norm": 0.3459611623277745,
+      "learning_rate": 1.3957032416651983e-05,
+      "loss": 0.5734,
+      "step": 3131
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.34682337803813185,
+      "learning_rate": 1.3913048605625168e-05,
+      "loss": 0.6112,
+      "step": 3132
+    },
+    {
+      "epoch": 0.8354666666666667,
+      "grad_norm": 0.3545936607288065,
+      "learning_rate": 1.3869129026450423e-05,
+      "loss": 0.6057,
+      "step": 3133
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.33269372663317176,
+      "learning_rate": 1.3825273711897347e-05,
+      "loss": 0.5584,
+      "step": 3134
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.3305187222747315,
+      "learning_rate": 1.3781482694687598e-05,
+      "loss": 0.5426,
+      "step": 3135
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.3333210069668065,
+      "learning_rate": 1.3737756007494861e-05,
+      "loss": 0.5235,
+      "step": 3136
+    },
+    {
+      "epoch": 0.8365333333333334,
+      "grad_norm": 0.3598372953989674,
+      "learning_rate": 1.3694093682944853e-05,
+      "loss": 0.5473,
+      "step": 3137
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3521392383401204,
+      "learning_rate": 1.3650495753615244e-05,
+      "loss": 0.5894,
+      "step": 3138
+    },
+    {
+      "epoch": 0.8370666666666666,
+      "grad_norm": 0.34076214918151726,
+      "learning_rate": 1.3606962252035615e-05,
+      "loss": 0.5638,
+      "step": 3139
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.3672019914215645,
+      "learning_rate": 1.3563493210687529e-05,
+      "loss": 0.5914,
+      "step": 3140
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.34307137890350625,
+      "learning_rate": 1.3520088662004438e-05,
+      "loss": 0.5526,
+      "step": 3141
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.32295057140138955,
+      "learning_rate": 1.3476748638371672e-05,
+      "loss": 0.5504,
+      "step": 3142
+    },
+    {
+      "epoch": 0.8381333333333333,
+      "grad_norm": 0.33721794235393426,
+      "learning_rate": 1.3433473172126431e-05,
+      "loss": 0.6044,
+      "step": 3143
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.33626786453773044,
+      "learning_rate": 1.3390262295557731e-05,
+      "loss": 0.5821,
+      "step": 3144
+    },
+    {
+      "epoch": 0.8386666666666667,
+      "grad_norm": 0.3619784898673849,
+      "learning_rate": 1.3347116040906394e-05,
+      "loss": 0.5621,
+      "step": 3145
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.3226946715887109,
+      "learning_rate": 1.3304034440365065e-05,
+      "loss": 0.5414,
+      "step": 3146
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.3381053345806699,
+      "learning_rate": 1.3261017526078057e-05,
+      "loss": 0.5414,
+      "step": 3147
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.3600032170298199,
+      "learning_rate": 1.3218065330141515e-05,
+      "loss": 0.5783,
+      "step": 3148
+    },
+    {
+      "epoch": 0.8397333333333333,
+      "grad_norm": 0.36358285934799534,
+      "learning_rate": 1.3175177884603252e-05,
+      "loss": 0.582,
+      "step": 3149
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.33877387393008823,
+      "learning_rate": 1.3132355221462778e-05,
+      "loss": 0.5736,
+      "step": 3150
+    },
+    {
+      "epoch": 0.8402666666666667,
+      "grad_norm": 0.34036159221005535,
+      "learning_rate": 1.3089597372671259e-05,
+      "loss": 0.532,
+      "step": 3151
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.32721215150383337,
+      "learning_rate": 1.3046904370131507e-05,
+      "loss": 0.5634,
+      "step": 3152
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.35704536091702926,
+      "learning_rate": 1.3004276245697955e-05,
+      "loss": 0.5458,
+      "step": 3153
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3514385322166996,
+      "learning_rate": 1.2961713031176625e-05,
+      "loss": 0.5493,
+      "step": 3154
+    },
+    {
+      "epoch": 0.8413333333333334,
+      "grad_norm": 0.38800781784510435,
+      "learning_rate": 1.2919214758325104e-05,
+      "loss": 0.5645,
+      "step": 3155
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3366959556255024,
+      "learning_rate": 1.2876781458852538e-05,
+      "loss": 0.5916,
+      "step": 3156
+    },
+    {
+      "epoch": 0.8418666666666667,
+      "grad_norm": 0.33977567455723934,
+      "learning_rate": 1.2834413164419567e-05,
+      "loss": 0.5643,
+      "step": 3157
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.34141280336233987,
+      "learning_rate": 1.279210990663835e-05,
+      "loss": 0.5696,
+      "step": 3158
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.35339066196754426,
+      "learning_rate": 1.2749871717072515e-05,
+      "loss": 0.5755,
+      "step": 3159
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.32868857625428677,
+      "learning_rate": 1.2707698627237152e-05,
+      "loss": 0.5742,
+      "step": 3160
+    },
+    {
+      "epoch": 0.8429333333333333,
+      "grad_norm": 0.36450120748671516,
+      "learning_rate": 1.2665590668598781e-05,
+      "loss": 0.5495,
+      "step": 3161
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3459870745887521,
+      "learning_rate": 1.262354787257527e-05,
+      "loss": 0.5821,
+      "step": 3162
+    },
+    {
+      "epoch": 0.8434666666666667,
+      "grad_norm": 0.32624892871447686,
+      "learning_rate": 1.2581570270535924e-05,
+      "loss": 0.5407,
+      "step": 3163
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.32909254573737806,
+      "learning_rate": 1.2539657893801416e-05,
+      "loss": 0.5457,
+      "step": 3164
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.327849396654827,
+      "learning_rate": 1.2497810773643704e-05,
+      "loss": 0.5587,
+      "step": 3165
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.32594205283464733,
+      "learning_rate": 1.245602894128609e-05,
+      "loss": 0.5615,
+      "step": 3166
+    },
+    {
+      "epoch": 0.8445333333333334,
+      "grad_norm": 0.350382500476646,
+      "learning_rate": 1.2414312427903152e-05,
+      "loss": 0.6224,
+      "step": 3167
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.33571184143014543,
+      "learning_rate": 1.2372661264620744e-05,
+      "loss": 0.5729,
+      "step": 3168
+    },
+    {
+      "epoch": 0.8450666666666666,
+      "grad_norm": 0.325950602377954,
+      "learning_rate": 1.2331075482515942e-05,
+      "loss": 0.5496,
+      "step": 3169
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.34304794355718793,
+      "learning_rate": 1.2289555112617024e-05,
+      "loss": 0.5685,
+      "step": 3170
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.3423742386766403,
+      "learning_rate": 1.2248100185903488e-05,
+      "loss": 0.5635,
+      "step": 3171
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.3389473443874955,
+      "learning_rate": 1.2206710733306037e-05,
+      "loss": 0.5444,
+      "step": 3172
+    },
+    {
+      "epoch": 0.8461333333333333,
+      "grad_norm": 0.34594767698427004,
+      "learning_rate": 1.2165386785706456e-05,
+      "loss": 0.5874,
+      "step": 3173
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3705801891599974,
+      "learning_rate": 1.2124128373937693e-05,
+      "loss": 0.5543,
+      "step": 3174
+    },
+    {
+      "epoch": 0.8466666666666667,
+      "grad_norm": 0.35331391348330077,
+      "learning_rate": 1.208293552878379e-05,
+      "loss": 0.5895,
+      "step": 3175
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.32947477169748096,
+      "learning_rate": 1.204180828097986e-05,
+      "loss": 0.54,
+      "step": 3176
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.35936722997120535,
+      "learning_rate": 1.2000746661212104e-05,
+      "loss": 0.5686,
+      "step": 3177
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.3552943797490246,
+      "learning_rate": 1.1959750700117678e-05,
+      "loss": 0.6426,
+      "step": 3178
+    },
+    {
+      "epoch": 0.8477333333333333,
+      "grad_norm": 0.37181025722058303,
+      "learning_rate": 1.1918820428284839e-05,
+      "loss": 0.6171,
+      "step": 3179
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.34126260558190114,
+      "learning_rate": 1.1877955876252778e-05,
+      "loss": 0.5775,
+      "step": 3180
+    },
+    {
+      "epoch": 0.8482666666666666,
+      "grad_norm": 0.348535537282637,
+      "learning_rate": 1.1837157074511674e-05,
+      "loss": 0.5831,
+      "step": 3181
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.3335099676123647,
+      "learning_rate": 1.1796424053502641e-05,
+      "loss": 0.5215,
+      "step": 3182
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.322370643651146,
+      "learning_rate": 1.1755756843617705e-05,
+      "loss": 0.5369,
+      "step": 3183
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.3343027320626063,
+      "learning_rate": 1.1715155475199791e-05,
+      "loss": 0.5893,
+      "step": 3184
+    },
+    {
+      "epoch": 0.8493333333333334,
+      "grad_norm": 0.3398381708915825,
+      "learning_rate": 1.1674619978542734e-05,
+      "loss": 0.5598,
+      "step": 3185
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.3504585743785552,
+      "learning_rate": 1.1634150383891152e-05,
+      "loss": 0.5265,
+      "step": 3186
+    },
+    {
+      "epoch": 0.8498666666666667,
+      "grad_norm": 0.34953786866756165,
+      "learning_rate": 1.1593746721440524e-05,
+      "loss": 0.5738,
+      "step": 3187
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.3644657263291906,
+      "learning_rate": 1.1553409021337148e-05,
+      "loss": 0.5444,
+      "step": 3188
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.3800849986514281,
+      "learning_rate": 1.1513137313678113e-05,
+      "loss": 0.5533,
+      "step": 3189
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.35258202623158125,
+      "learning_rate": 1.147293162851123e-05,
+      "loss": 0.5148,
+      "step": 3190
+    },
+    {
+      "epoch": 0.8509333333333333,
+      "grad_norm": 0.33044009416194836,
+      "learning_rate": 1.143279199583508e-05,
+      "loss": 0.5482,
+      "step": 3191
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3530916489213741,
+      "learning_rate": 1.1392718445598949e-05,
+      "loss": 0.5825,
+      "step": 3192
+    },
+    {
+      "epoch": 0.8514666666666667,
+      "grad_norm": 0.3508665335390478,
+      "learning_rate": 1.1352711007702832e-05,
+      "loss": 0.5579,
+      "step": 3193
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.3466998416938436,
+      "learning_rate": 1.1312769711997362e-05,
+      "loss": 0.5432,
+      "step": 3194
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3411622791986878,
+      "learning_rate": 1.1272894588283867e-05,
+      "loss": 0.5772,
+      "step": 3195
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.36269813098005116,
+      "learning_rate": 1.1233085666314258e-05,
+      "loss": 0.5779,
+      "step": 3196
+    },
+    {
+      "epoch": 0.8525333333333334,
+      "grad_norm": 0.3578020137917265,
+      "learning_rate": 1.1193342975791076e-05,
+      "loss": 0.5465,
+      "step": 3197
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3553736712478192,
+      "learning_rate": 1.1153666546367447e-05,
+      "loss": 0.5898,
+      "step": 3198
+    },
+    {
+      "epoch": 0.8530666666666666,
+      "grad_norm": 0.3277443264332803,
+      "learning_rate": 1.1114056407647044e-05,
+      "loss": 0.5554,
+      "step": 3199
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.3364657624990799,
+      "learning_rate": 1.1074512589184105e-05,
+      "loss": 0.5442,
+      "step": 3200
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.32918260501145835,
+      "learning_rate": 1.1035035120483328e-05,
+      "loss": 0.527,
+      "step": 3201
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.3295555408946516,
+      "learning_rate": 1.0995624030999974e-05,
+      "loss": 0.5934,
+      "step": 3202
+    },
+    {
+      "epoch": 0.8541333333333333,
+      "grad_norm": 0.35935900137617577,
+      "learning_rate": 1.095627935013972e-05,
+      "loss": 0.6097,
+      "step": 3203
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3401256039563951,
+      "learning_rate": 1.091700110725874e-05,
+      "loss": 0.5766,
+      "step": 3204
+    },
+    {
+      "epoch": 0.8546666666666667,
+      "grad_norm": 0.34740509250637935,
+      "learning_rate": 1.0877789331663612e-05,
+      "loss": 0.619,
+      "step": 3205
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.34838854795588037,
+      "learning_rate": 1.0838644052611314e-05,
+      "loss": 0.6076,
+      "step": 3206
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.34881942738001387,
+      "learning_rate": 1.0799565299309233e-05,
+      "loss": 0.5899,
+      "step": 3207
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.33809558751613183,
+      "learning_rate": 1.0760553100915093e-05,
+      "loss": 0.5903,
+      "step": 3208
+    },
+    {
+      "epoch": 0.8557333333333333,
+      "grad_norm": 0.3503737839708749,
+      "learning_rate": 1.0721607486536989e-05,
+      "loss": 0.5604,
+      "step": 3209
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3510536872947679,
+      "learning_rate": 1.0682728485233307e-05,
+      "loss": 0.5201,
+      "step": 3210
+    },
+    {
+      "epoch": 0.8562666666666666,
+      "grad_norm": 0.34266434087244585,
+      "learning_rate": 1.0643916126012755e-05,
+      "loss": 0.5804,
+      "step": 3211
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.35739511854891026,
+      "learning_rate": 1.060517043783429e-05,
+      "loss": 0.6024,
+      "step": 3212
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.33487402596688814,
+      "learning_rate": 1.0566491449607152e-05,
+      "loss": 0.5815,
+      "step": 3213
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.3337753554192245,
+      "learning_rate": 1.0527879190190793e-05,
+      "loss": 0.5135,
+      "step": 3214
+    },
+    {
+      "epoch": 0.8573333333333333,
+      "grad_norm": 0.34646176413437607,
+      "learning_rate": 1.0489333688394898e-05,
+      "loss": 0.563,
+      "step": 3215
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.35282410195835445,
+      "learning_rate": 1.0450854972979351e-05,
+      "loss": 0.5998,
+      "step": 3216
+    },
+    {
+      "epoch": 0.8578666666666667,
+      "grad_norm": 0.38319147117092245,
+      "learning_rate": 1.0412443072654132e-05,
+      "loss": 0.5739,
+      "step": 3217
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.32483721109258223,
+      "learning_rate": 1.0374098016079447e-05,
+      "loss": 0.5491,
+      "step": 3218
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.3608205263968364,
+      "learning_rate": 1.0335819831865601e-05,
+      "loss": 0.5895,
+      "step": 3219
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.33152552478422187,
+      "learning_rate": 1.0297608548573e-05,
+      "loss": 0.5543,
+      "step": 3220
+    },
+    {
+      "epoch": 0.8589333333333333,
+      "grad_norm": 0.3977908002600204,
+      "learning_rate": 1.0259464194712153e-05,
+      "loss": 0.582,
+      "step": 3221
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.35248138235149407,
+      "learning_rate": 1.0221386798743604e-05,
+      "loss": 0.5868,
+      "step": 3222
+    },
+    {
+      "epoch": 0.8594666666666667,
+      "grad_norm": 0.35197958026965886,
+      "learning_rate": 1.0183376389077948e-05,
+      "loss": 0.5722,
+      "step": 3223
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.34013751184786606,
+      "learning_rate": 1.0145432994075833e-05,
+      "loss": 0.5646,
+      "step": 3224
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.34994347886653926,
+      "learning_rate": 1.010755664204781e-05,
+      "loss": 0.5807,
+      "step": 3225
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.34550206003348827,
+      "learning_rate": 1.00697473612545e-05,
+      "loss": 0.5491,
+      "step": 3226
+    },
+    {
+      "epoch": 0.8605333333333334,
+      "grad_norm": 0.33916558306614775,
+      "learning_rate": 1.0032005179906478e-05,
+      "loss": 0.547,
+      "step": 3227
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3456662667542057,
+      "learning_rate": 9.994330126164208e-06,
+      "loss": 0.5791,
+      "step": 3228
+    },
+    {
+      "epoch": 0.8610666666666666,
+      "grad_norm": 0.3421016826860681,
+      "learning_rate": 9.956722228138083e-06,
+      "loss": 0.5647,
+      "step": 3229
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.35431801656339623,
+      "learning_rate": 9.919181513888409e-06,
+      "loss": 0.59,
+      "step": 3230
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.33044793847886744,
+      "learning_rate": 9.88170801142536e-06,
+      "loss": 0.5511,
+      "step": 3231
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.3696311999080211,
+      "learning_rate": 9.844301748708906e-06,
+      "loss": 0.5627,
+      "step": 3232
+    },
+    {
+      "epoch": 0.8621333333333333,
+      "grad_norm": 0.3315578626872966,
+      "learning_rate": 9.806962753648929e-06,
+      "loss": 0.5554,
+      "step": 3233
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3272825363008379,
+      "learning_rate": 9.769691054105067e-06,
+      "loss": 0.5253,
+      "step": 3234
+    },
+    {
+      "epoch": 0.8626666666666667,
+      "grad_norm": 0.35656411744864613,
+      "learning_rate": 9.732486677886777e-06,
+      "loss": 0.5846,
+      "step": 3235
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.3433762363590593,
+      "learning_rate": 9.69534965275326e-06,
+      "loss": 0.5875,
+      "step": 3236
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4143839401623141,
+      "learning_rate": 9.658280006413488e-06,
+      "loss": 0.601,
+      "step": 3237
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.3615864490544622,
+      "learning_rate": 9.621277766526138e-06,
+      "loss": 0.5731,
+      "step": 3238
+    },
+    {
+      "epoch": 0.8637333333333334,
+      "grad_norm": 0.3680894124977018,
+      "learning_rate": 9.584342960699633e-06,
+      "loss": 0.6064,
+      "step": 3239
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.37612550367805253,
+      "learning_rate": 9.547475616492007e-06,
+      "loss": 0.5787,
+      "step": 3240
+    },
+    {
+      "epoch": 0.8642666666666666,
+      "grad_norm": 0.3678757434369968,
+      "learning_rate": 9.510675761411015e-06,
+      "loss": 0.5789,
+      "step": 3241
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.3394426309988805,
+      "learning_rate": 9.473943422914067e-06,
+      "loss": 0.5397,
+      "step": 3242
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.369978562311377,
+      "learning_rate": 9.437278628408153e-06,
+      "loss": 0.5215,
+      "step": 3243
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.3349487137594821,
+      "learning_rate": 9.4006814052499e-06,
+      "loss": 0.5357,
+      "step": 3244
+    },
+    {
+      "epoch": 0.8653333333333333,
+      "grad_norm": 0.37528542341620424,
+      "learning_rate": 9.364151780745501e-06,
+      "loss": 0.5653,
+      "step": 3245
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.34429230510042047,
+      "learning_rate": 9.327689782150729e-06,
+      "loss": 0.5621,
+      "step": 3246
+    },
+    {
+      "epoch": 0.8658666666666667,
+      "grad_norm": 0.33909386793917956,
+      "learning_rate": 9.291295436670877e-06,
+      "loss": 0.545,
+      "step": 3247
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.3449815903588223,
+      "learning_rate": 9.25496877146077e-06,
+      "loss": 0.5679,
+      "step": 3248
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.34378056093082454,
+      "learning_rate": 9.218709813624748e-06,
+      "loss": 0.5891,
+      "step": 3249
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.35957566797279594,
+      "learning_rate": 9.182518590216615e-06,
+      "loss": 0.5928,
+      "step": 3250
+    },
+    {
+      "epoch": 0.8669333333333333,
+      "grad_norm": 0.328655898207611,
+      "learning_rate": 9.146395128239637e-06,
+      "loss": 0.5625,
+      "step": 3251
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3518279321720355,
+      "learning_rate": 9.110339454646532e-06,
+      "loss": 0.6103,
+      "step": 3252
+    },
+    {
+      "epoch": 0.8674666666666667,
+      "grad_norm": 0.557246452495665,
+      "learning_rate": 9.074351596339437e-06,
+      "loss": 0.5913,
+      "step": 3253
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.40350832276323767,
+      "learning_rate": 9.03843158016987e-06,
+      "loss": 0.5946,
+      "step": 3254
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.33970570901158936,
+      "learning_rate": 9.002579432938795e-06,
+      "loss": 0.5538,
+      "step": 3255
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.3461920017211161,
+      "learning_rate": 8.966795181396425e-06,
+      "loss": 0.5388,
+      "step": 3256
+    },
+    {
+      "epoch": 0.8685333333333334,
+      "grad_norm": 0.34190066830255655,
+      "learning_rate": 8.931078852242413e-06,
+      "loss": 0.5864,
+      "step": 3257
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3356230809666767,
+      "learning_rate": 8.895430472125687e-06,
+      "loss": 0.5621,
+      "step": 3258
+    },
+    {
+      "epoch": 0.8690666666666667,
+      "grad_norm": 0.345759208794225,
+      "learning_rate": 8.859850067644505e-06,
+      "loss": 0.6045,
+      "step": 3259
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.33861681655206805,
+      "learning_rate": 8.824337665346371e-06,
+      "loss": 0.5832,
+      "step": 3260
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.362354139084554,
+      "learning_rate": 8.788893291728083e-06,
+      "loss": 0.5989,
+      "step": 3261
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.3530734766136594,
+      "learning_rate": 8.753516973235654e-06,
+      "loss": 0.5652,
+      "step": 3262
+    },
+    {
+      "epoch": 0.8701333333333333,
+      "grad_norm": 0.44759324956005603,
+      "learning_rate": 8.718208736264344e-06,
+      "loss": 0.5075,
+      "step": 3263
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.33063703542549155,
+      "learning_rate": 8.682968607158604e-06,
+      "loss": 0.5579,
+      "step": 3264
+    },
+    {
+      "epoch": 0.8706666666666667,
+      "grad_norm": 0.37433577562887554,
+      "learning_rate": 8.647796612212056e-06,
+      "loss": 0.6016,
+      "step": 3265
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.3337620533484374,
+      "learning_rate": 8.612692777667498e-06,
+      "loss": 0.5545,
+      "step": 3266
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.31873766786679025,
+      "learning_rate": 8.577657129716887e-06,
+      "loss": 0.5149,
+      "step": 3267
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.33669869623011145,
+      "learning_rate": 8.542689694501272e-06,
+      "loss": 0.5696,
+      "step": 3268
+    },
+    {
+      "epoch": 0.8717333333333334,
+      "grad_norm": 0.3644123820171066,
+      "learning_rate": 8.507790498110824e-06,
+      "loss": 0.5294,
+      "step": 3269
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.32859115143557605,
+      "learning_rate": 8.472959566584804e-06,
+      "loss": 0.5524,
+      "step": 3270
+    },
+    {
+      "epoch": 0.8722666666666666,
+      "grad_norm": 0.3649837198560756,
+      "learning_rate": 8.438196925911546e-06,
+      "loss": 0.6044,
+      "step": 3271
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.353299286740774,
+      "learning_rate": 8.40350260202838e-06,
+      "loss": 0.5793,
+      "step": 3272
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.35775039633801486,
+      "learning_rate": 8.36887662082172e-06,
+      "loss": 0.5644,
+      "step": 3273
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.3461767398297513,
+      "learning_rate": 8.334319008126967e-06,
+      "loss": 0.5772,
+      "step": 3274
+    },
+    {
+      "epoch": 0.8733333333333333,
+      "grad_norm": 0.33164862832406716,
+      "learning_rate": 8.299829789728498e-06,
+      "loss": 0.5075,
+      "step": 3275
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.36987386787045756,
+      "learning_rate": 8.265408991359691e-06,
+      "loss": 0.5705,
+      "step": 3276
+    },
+    {
+      "epoch": 0.8738666666666667,
+      "grad_norm": 0.35301635551176375,
+      "learning_rate": 8.231056638702839e-06,
+      "loss": 0.6106,
+      "step": 3277
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.3462908486757754,
+      "learning_rate": 8.196772757389203e-06,
+      "loss": 0.5562,
+      "step": 3278
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.3328326226517987,
+      "learning_rate": 8.162557372998913e-06,
+      "loss": 0.591,
+      "step": 3279
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.3388622961992753,
+      "learning_rate": 8.128410511061002e-06,
+      "loss": 0.5528,
+      "step": 3280
+    },
+    {
+      "epoch": 0.8749333333333333,
+      "grad_norm": 0.37635543294128826,
+      "learning_rate": 8.094332197053412e-06,
+      "loss": 0.6112,
+      "step": 3281
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3799775151648602,
+      "learning_rate": 8.06032245640288e-06,
+      "loss": 0.6067,
+      "step": 3282
+    },
+    {
+      "epoch": 0.8754666666666666,
+      "grad_norm": 0.35906077347323107,
+      "learning_rate": 8.026381314485054e-06,
+      "loss": 0.5215,
+      "step": 3283
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.3287536923184325,
+      "learning_rate": 7.992508796624343e-06,
+      "loss": 0.5261,
+      "step": 3284
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.327492567496066,
+      "learning_rate": 7.958704928093963e-06,
+      "loss": 0.4939,
+      "step": 3285
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.36053509661078054,
+      "learning_rate": 7.924969734115928e-06,
+      "loss": 0.5629,
+      "step": 3286
+    },
+    {
+      "epoch": 0.8765333333333334,
+      "grad_norm": 0.3363783491727698,
+      "learning_rate": 7.89130323986098e-06,
+      "loss": 0.5794,
+      "step": 3287
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.34796371043350693,
+      "learning_rate": 7.857705470448617e-06,
+      "loss": 0.5444,
+      "step": 3288
+    },
+    {
+      "epoch": 0.8770666666666667,
+      "grad_norm": 0.3184679776396525,
+      "learning_rate": 7.824176450947075e-06,
+      "loss": 0.5546,
+      "step": 3289
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.34421400841504457,
+      "learning_rate": 7.790716206373283e-06,
+      "loss": 0.5599,
+      "step": 3290
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.3532589077353095,
+      "learning_rate": 7.757324761692841e-06,
+      "loss": 0.6085,
+      "step": 3291
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.3448776749614105,
+      "learning_rate": 7.72400214182004e-06,
+      "loss": 0.5196,
+      "step": 3292
+    },
+    {
+      "epoch": 0.8781333333333333,
+      "grad_norm": 0.3350082273654233,
+      "learning_rate": 7.690748371617806e-06,
+      "loss": 0.5373,
+      "step": 3293
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.34487808315604784,
+      "learning_rate": 7.657563475897711e-06,
+      "loss": 0.575,
+      "step": 3294
+    },
+    {
+      "epoch": 0.8786666666666667,
+      "grad_norm": 0.35237549556478437,
+      "learning_rate": 7.624447479419883e-06,
+      "loss": 0.6269,
+      "step": 3295
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.33164028359496506,
+      "learning_rate": 7.591400406893101e-06,
+      "loss": 0.5488,
+      "step": 3296
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.3542185727173985,
+      "learning_rate": 7.558422282974708e-06,
+      "loss": 0.5764,
+      "step": 3297
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.3611010524609047,
+      "learning_rate": 7.525513132270579e-06,
+      "loss": 0.5811,
+      "step": 3298
+    },
+    {
+      "epoch": 0.8797333333333334,
+      "grad_norm": 0.36001884298028736,
+      "learning_rate": 7.492672979335147e-06,
+      "loss": 0.6253,
+      "step": 3299
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.36426856748370834,
+      "learning_rate": 7.459901848671347e-06,
+      "loss": 0.5798,
+      "step": 3300
+    },
+    {
+      "epoch": 0.8802666666666666,
+      "grad_norm": 0.35830557996569656,
+      "learning_rate": 7.4271997647306415e-06,
+      "loss": 0.5523,
+      "step": 3301
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.34978906934982534,
+      "learning_rate": 7.394566751912957e-06,
+      "loss": 0.5263,
+      "step": 3302
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.35198915069184167,
+      "learning_rate": 7.3620028345666726e-06,
+      "loss": 0.5833,
+      "step": 3303
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.35568275541387856,
+      "learning_rate": 7.329508036988641e-06,
+      "loss": 0.5811,
+      "step": 3304
+    },
+    {
+      "epoch": 0.8813333333333333,
+      "grad_norm": 0.35916953034639415,
+      "learning_rate": 7.297082383424115e-06,
+      "loss": 0.643,
+      "step": 3305
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.34236865114061354,
+      "learning_rate": 7.2647258980667706e-06,
+      "loss": 0.5967,
+      "step": 3306
+    },
+    {
+      "epoch": 0.8818666666666667,
+      "grad_norm": 0.34567493944293115,
+      "learning_rate": 7.232438605058689e-06,
+      "loss": 0.5553,
+      "step": 3307
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 1.3100793124503591,
+      "learning_rate": 7.200220528490298e-06,
+      "loss": 0.591,
+      "step": 3308
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.3639618549404457,
+      "learning_rate": 7.168071692400402e-06,
+      "loss": 0.549,
+      "step": 3309
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.33472015991256804,
+      "learning_rate": 7.1359921207761585e-06,
+      "loss": 0.5326,
+      "step": 3310
+    },
+    {
+      "epoch": 0.8829333333333333,
+      "grad_norm": 0.34037985197760295,
+      "learning_rate": 7.1039818375529644e-06,
+      "loss": 0.5831,
+      "step": 3311
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.33021966945560177,
+      "learning_rate": 7.072040866614616e-06,
+      "loss": 0.5317,
+      "step": 3312
+    },
+    {
+      "epoch": 0.8834666666666666,
+      "grad_norm": 0.331136993799541,
+      "learning_rate": 7.040169231793137e-06,
+      "loss": 0.5506,
+      "step": 3313
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.3486234207619461,
+      "learning_rate": 7.0083669568688505e-06,
+      "loss": 0.6035,
+      "step": 3314
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.3638037317700521,
+      "learning_rate": 6.976634065570309e-06,
+      "loss": 0.5656,
+      "step": 3315
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.3421952045586565,
+      "learning_rate": 6.944970581574284e-06,
+      "loss": 0.5403,
+      "step": 3316
+    },
+    {
+      "epoch": 0.8845333333333333,
+      "grad_norm": 0.33440102904543934,
+      "learning_rate": 6.913376528505799e-06,
+      "loss": 0.5521,
+      "step": 3317
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3765112715460031,
+      "learning_rate": 6.881851929938021e-06,
+      "loss": 0.5992,
+      "step": 3318
+    },
+    {
+      "epoch": 0.8850666666666667,
+      "grad_norm": 0.3630694747416555,
+      "learning_rate": 6.850396809392356e-06,
+      "loss": 0.5965,
+      "step": 3319
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.3522758941752851,
+      "learning_rate": 6.819011190338309e-06,
+      "loss": 0.5636,
+      "step": 3320
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.3385246380739454,
+      "learning_rate": 6.78769509619358e-06,
+      "loss": 0.5607,
+      "step": 3321
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.32551809370357543,
+      "learning_rate": 6.7564485503239574e-06,
+      "loss": 0.5295,
+      "step": 3322
+    },
+    {
+      "epoch": 0.8861333333333333,
+      "grad_norm": 0.35627144264962807,
+      "learning_rate": 6.725271576043346e-06,
+      "loss": 0.5737,
+      "step": 3323
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.36115897784736484,
+      "learning_rate": 6.694164196613772e-06,
+      "loss": 0.5558,
+      "step": 3324
+    },
+    {
+      "epoch": 0.8866666666666667,
+      "grad_norm": 0.3341930188039213,
+      "learning_rate": 6.663126435245304e-06,
+      "loss": 0.5433,
+      "step": 3325
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.3810960423378213,
+      "learning_rate": 6.63215831509606e-06,
+      "loss": 0.5879,
+      "step": 3326
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.33993605890660894,
+      "learning_rate": 6.601259859272202e-06,
+      "loss": 0.586,
+      "step": 3327
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.35004036929817195,
+      "learning_rate": 6.570431090827944e-06,
+      "loss": 0.6266,
+      "step": 3328
+    },
+    {
+      "epoch": 0.8877333333333334,
+      "grad_norm": 0.3717503938383483,
+      "learning_rate": 6.539672032765465e-06,
+      "loss": 0.6555,
+      "step": 3329
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3445225343826448,
+      "learning_rate": 6.508982708034961e-06,
+      "loss": 0.5866,
+      "step": 3330
+    },
+    {
+      "epoch": 0.8882666666666666,
+      "grad_norm": 0.35045801232569795,
+      "learning_rate": 6.478363139534571e-06,
+      "loss": 0.5893,
+      "step": 3331
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.35154649913330455,
+      "learning_rate": 6.44781335011041e-06,
+      "loss": 0.5901,
+      "step": 3332
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.35292744534969683,
+      "learning_rate": 6.417333362556532e-06,
+      "loss": 0.5655,
+      "step": 3333
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.3456544766542196,
+      "learning_rate": 6.3869231996148695e-06,
+      "loss": 0.5373,
+      "step": 3334
+    },
+    {
+      "epoch": 0.8893333333333333,
+      "grad_norm": 0.3524234625854795,
+      "learning_rate": 6.3565828839753035e-06,
+      "loss": 0.5855,
+      "step": 3335
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.39272882262439995,
+      "learning_rate": 6.326312438275572e-06,
+      "loss": 0.6213,
+      "step": 3336
+    },
+    {
+      "epoch": 0.8898666666666667,
+      "grad_norm": 0.3448382217704263,
+      "learning_rate": 6.296111885101297e-06,
+      "loss": 0.5956,
+      "step": 3337
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.3513853832014005,
+      "learning_rate": 6.265981246985919e-06,
+      "loss": 0.5351,
+      "step": 3338
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.34568456451112917,
+      "learning_rate": 6.2359205464107895e-06,
+      "loss": 0.5505,
+      "step": 3339
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.48686243022011005,
+      "learning_rate": 6.2059298058049995e-06,
+      "loss": 0.5287,
+      "step": 3340
+    },
+    {
+      "epoch": 0.8909333333333334,
+      "grad_norm": 0.3460418659531681,
+      "learning_rate": 6.1760090475454834e-06,
+      "loss": 0.5458,
+      "step": 3341
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.36103118976956017,
+      "learning_rate": 6.146158293956927e-06,
+      "loss": 0.5813,
+      "step": 3342
+    },
+    {
+      "epoch": 0.8914666666666666,
+      "grad_norm": 0.3461758153050274,
+      "learning_rate": 6.116377567311793e-06,
+      "loss": 0.5976,
+      "step": 3343
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.3581011010382332,
+      "learning_rate": 6.086666889830328e-06,
+      "loss": 0.5854,
+      "step": 3344
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.33804936136371316,
+      "learning_rate": 6.057026283680478e-06,
+      "loss": 0.5853,
+      "step": 3345
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.3340592069086362,
+      "learning_rate": 6.02745577097793e-06,
+      "loss": 0.5712,
+      "step": 3346
+    },
+    {
+      "epoch": 0.8925333333333333,
+      "grad_norm": 0.34941985204684334,
+      "learning_rate": 5.997955373786035e-06,
+      "loss": 0.5976,
+      "step": 3347
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3643799347137535,
+      "learning_rate": 5.968525114115875e-06,
+      "loss": 0.5341,
+      "step": 3348
+    },
+    {
+      "epoch": 0.8930666666666667,
+      "grad_norm": 0.32646250831254653,
+      "learning_rate": 5.939165013926196e-06,
+      "loss": 0.5547,
+      "step": 3349
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.3454219699314769,
+      "learning_rate": 5.90987509512333e-06,
+      "loss": 0.5632,
+      "step": 3350
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.33942577975764904,
+      "learning_rate": 5.880655379561328e-06,
+      "loss": 0.5884,
+      "step": 3351
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.36689672439472804,
+      "learning_rate": 5.851505889041819e-06,
+      "loss": 0.5489,
+      "step": 3352
+    },
+    {
+      "epoch": 0.8941333333333333,
+      "grad_norm": 0.3383970330327614,
+      "learning_rate": 5.82242664531405e-06,
+      "loss": 0.5603,
+      "step": 3353
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3449391882849117,
+      "learning_rate": 5.793417670074841e-06,
+      "loss": 0.5395,
+      "step": 3354
+    },
+    {
+      "epoch": 0.8946666666666667,
+      "grad_norm": 0.3655048693945037,
+      "learning_rate": 5.764478984968591e-06,
+      "loss": 0.6243,
+      "step": 3355
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.3619076553017383,
+      "learning_rate": 5.73561061158725e-06,
+      "loss": 0.5677,
+      "step": 3356
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.35851992046288833,
+      "learning_rate": 5.70681257147031e-06,
+      "loss": 0.5882,
+      "step": 3357
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.35672187751638573,
+      "learning_rate": 5.678084886104795e-06,
+      "loss": 0.5336,
+      "step": 3358
+    },
+    {
+      "epoch": 0.8957333333333334,
+      "grad_norm": 0.3324540489987115,
+      "learning_rate": 5.649427576925204e-06,
+      "loss": 0.5633,
+      "step": 3359
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.31751610760453475,
+      "learning_rate": 5.620840665313554e-06,
+      "loss": 0.5377,
+      "step": 3360
+    },
+    {
+      "epoch": 0.8962666666666667,
+      "grad_norm": 0.3369519572095269,
+      "learning_rate": 5.59232417259935e-06,
+      "loss": 0.542,
+      "step": 3361
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.32943862031547017,
+      "learning_rate": 5.563878120059507e-06,
+      "loss": 0.5257,
+      "step": 3362
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.3545512427860248,
+      "learning_rate": 5.535502528918413e-06,
+      "loss": 0.5273,
+      "step": 3363
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.34791790302858805,
+      "learning_rate": 5.507197420347909e-06,
+      "loss": 0.5435,
+      "step": 3364
+    },
+    {
+      "epoch": 0.8973333333333333,
+      "grad_norm": 0.3672933208616374,
+      "learning_rate": 5.478962815467193e-06,
+      "loss": 0.5796,
+      "step": 3365
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.357506143076227,
+      "learning_rate": 5.450798735342877e-06,
+      "loss": 0.5818,
+      "step": 3366
+    },
+    {
+      "epoch": 0.8978666666666667,
+      "grad_norm": 0.3431617037447951,
+      "learning_rate": 5.422705200988975e-06,
+      "loss": 0.5708,
+      "step": 3367
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.3435081114997157,
+      "learning_rate": 5.394682233366844e-06,
+      "loss": 0.5992,
+      "step": 3368
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.3449471231427674,
+      "learning_rate": 5.366729853385189e-06,
+      "loss": 0.5831,
+      "step": 3369
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.3515242856874825,
+      "learning_rate": 5.3388480819000604e-06,
+      "loss": 0.5426,
+      "step": 3370
+    },
+    {
+      "epoch": 0.8989333333333334,
+      "grad_norm": 0.3278263290498706,
+      "learning_rate": 5.3110369397148195e-06,
+      "loss": 0.5698,
+      "step": 3371
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4205046184387036,
+      "learning_rate": 5.283296447580121e-06,
+      "loss": 0.5119,
+      "step": 3372
+    },
+    {
+      "epoch": 0.8994666666666666,
+      "grad_norm": 0.33514078064743896,
+      "learning_rate": 5.25562662619391e-06,
+      "loss": 0.5361,
+      "step": 3373
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.3651376001740104,
+      "learning_rate": 5.2280274962014155e-06,
+      "loss": 0.5825,
+      "step": 3374
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.34651899022009525,
+      "learning_rate": 5.200499078195109e-06,
+      "loss": 0.5548,
+      "step": 3375
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.3492079567788359,
+      "learning_rate": 5.173041392714695e-06,
+      "loss": 0.5131,
+      "step": 3376
+    },
+    {
+      "epoch": 0.9005333333333333,
+      "grad_norm": 0.3324962445554178,
+      "learning_rate": 5.145654460247107e-06,
+      "loss": 0.5635,
+      "step": 3377
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.3629628773480166,
+      "learning_rate": 5.118338301226511e-06,
+      "loss": 0.5467,
+      "step": 3378
+    },
+    {
+      "epoch": 0.9010666666666667,
+      "grad_norm": 0.3646730658641418,
+      "learning_rate": 5.091092936034225e-06,
+      "loss": 0.6,
+      "step": 3379
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.3283987708264201,
+      "learning_rate": 5.063918384998801e-06,
+      "loss": 0.5338,
+      "step": 3380
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.3581215745826271,
+      "learning_rate": 5.036814668395884e-06,
+      "loss": 0.5509,
+      "step": 3381
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.35746200480286305,
+      "learning_rate": 5.009781806448321e-06,
+      "loss": 0.5639,
+      "step": 3382
+    },
+    {
+      "epoch": 0.9021333333333333,
+      "grad_norm": 0.35833296131263564,
+      "learning_rate": 4.982819819326079e-06,
+      "loss": 0.5061,
+      "step": 3383
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.34649362525522315,
+      "learning_rate": 4.955928727146242e-06,
+      "loss": 0.5869,
+      "step": 3384
+    },
+    {
+      "epoch": 0.9026666666666666,
+      "grad_norm": 0.32997879256129825,
+      "learning_rate": 4.929108549972994e-06,
+      "loss": 0.531,
+      "step": 3385
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.3371744799059837,
+      "learning_rate": 4.902359307817617e-06,
+      "loss": 0.5512,
+      "step": 3386
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.3588750402700034,
+      "learning_rate": 4.875681020638445e-06,
+      "loss": 0.5737,
+      "step": 3387
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.3356688343085048,
+      "learning_rate": 4.849073708340912e-06,
+      "loss": 0.5524,
+      "step": 3388
+    },
+    {
+      "epoch": 0.9037333333333334,
+      "grad_norm": 0.6348790138534877,
+      "learning_rate": 4.822537390777438e-06,
+      "loss": 0.5448,
+      "step": 3389
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.33941149381075175,
+      "learning_rate": 4.7960720877475055e-06,
+      "loss": 0.5512,
+      "step": 3390
+    },
+    {
+      "epoch": 0.9042666666666667,
+      "grad_norm": 0.3507717175658764,
+      "learning_rate": 4.76967781899762e-06,
+      "loss": 0.6007,
+      "step": 3391
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.334190670152143,
+      "learning_rate": 4.743354604221273e-06,
+      "loss": 0.5823,
+      "step": 3392
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.3720034306023298,
+      "learning_rate": 4.717102463058931e-06,
+      "loss": 0.5726,
+      "step": 3393
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.33913944176271804,
+      "learning_rate": 4.69092141509806e-06,
+      "loss": 0.5291,
+      "step": 3394
+    },
+    {
+      "epoch": 0.9053333333333333,
+      "grad_norm": 0.34026829187386864,
+      "learning_rate": 4.664811479873066e-06,
+      "loss": 0.5714,
+      "step": 3395
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.35689347000342464,
+      "learning_rate": 4.638772676865266e-06,
+      "loss": 0.5847,
+      "step": 3396
+    },
+    {
+      "epoch": 0.9058666666666667,
+      "grad_norm": 0.33553290888199866,
+      "learning_rate": 4.612805025502953e-06,
+      "loss": 0.5218,
+      "step": 3397
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.33773884578643815,
+      "learning_rate": 4.586908545161295e-06,
+      "loss": 0.5765,
+      "step": 3398
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.3350389585714146,
+      "learning_rate": 4.56108325516238e-06,
+      "loss": 0.5661,
+      "step": 3399
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.33419233333684584,
+      "learning_rate": 4.53532917477516e-06,
+      "loss": 0.5645,
+      "step": 3400
+    },
+    {
+      "epoch": 0.9069333333333334,
+      "grad_norm": 0.33021951009146866,
+      "learning_rate": 4.509646323215477e-06,
+      "loss": 0.5626,
+      "step": 3401
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3390888722057864,
+      "learning_rate": 4.484034719646013e-06,
+      "loss": 0.5613,
+      "step": 3402
+    },
+    {
+      "epoch": 0.9074666666666666,
+      "grad_norm": 0.34782883901387635,
+      "learning_rate": 4.458494383176292e-06,
+      "loss": 0.5736,
+      "step": 3403
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.4285254476188629,
+      "learning_rate": 4.433025332862661e-06,
+      "loss": 0.5891,
+      "step": 3404
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.35685185292864363,
+      "learning_rate": 4.407627587708285e-06,
+      "loss": 0.5484,
+      "step": 3405
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.33162199278586796,
+      "learning_rate": 4.3823011666631274e-06,
+      "loss": 0.5282,
+      "step": 3406
+    },
+    {
+      "epoch": 0.9085333333333333,
+      "grad_norm": 0.3353837709097986,
+      "learning_rate": 4.357046088623917e-06,
+      "loss": 0.5775,
+      "step": 3407
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3422129663229781,
+      "learning_rate": 4.331862372434181e-06,
+      "loss": 0.5558,
+      "step": 3408
+    },
+    {
+      "epoch": 0.9090666666666667,
+      "grad_norm": 0.34790986516491507,
+      "learning_rate": 4.3067500368841665e-06,
+      "loss": 0.58,
+      "step": 3409
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.3601399418613975,
+      "learning_rate": 4.281709100710907e-06,
+      "loss": 0.5967,
+      "step": 3410
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.34061847847607335,
+      "learning_rate": 4.256739582598113e-06,
+      "loss": 0.5325,
+      "step": 3411
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.3474707757087644,
+      "learning_rate": 4.231841501176237e-06,
+      "loss": 0.5569,
+      "step": 3412
+    },
+    {
+      "epoch": 0.9101333333333333,
+      "grad_norm": 0.33413575214733837,
+      "learning_rate": 4.207014875022442e-06,
+      "loss": 0.5739,
+      "step": 3413
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.37447508497973897,
+      "learning_rate": 4.182259722660531e-06,
+      "loss": 0.6287,
+      "step": 3414
+    },
+    {
+      "epoch": 0.9106666666666666,
+      "grad_norm": 0.34966345445978925,
+      "learning_rate": 4.15757606256103e-06,
+      "loss": 0.5605,
+      "step": 3415
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.3569638808577305,
+      "learning_rate": 4.132963913141097e-06,
+      "loss": 0.5752,
+      "step": 3416
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.3828216812747884,
+      "learning_rate": 4.108423292764529e-06,
+      "loss": 0.5446,
+      "step": 3417
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.3426205356289075,
+      "learning_rate": 4.083954219741759e-06,
+      "loss": 0.5346,
+      "step": 3418
+    },
+    {
+      "epoch": 0.9117333333333333,
+      "grad_norm": 0.3729100291429102,
+      "learning_rate": 4.059556712329849e-06,
+      "loss": 0.5793,
+      "step": 3419
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3800997619534846,
+      "learning_rate": 4.035230788732447e-06,
+      "loss": 0.5434,
+      "step": 3420
+    },
+    {
+      "epoch": 0.9122666666666667,
+      "grad_norm": 0.3548511885515272,
+      "learning_rate": 4.010976467099781e-06,
+      "loss": 0.5415,
+      "step": 3421
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.34714705832546566,
+      "learning_rate": 3.986793765528696e-06,
+      "loss": 0.553,
+      "step": 3422
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.3201284214399404,
+      "learning_rate": 3.962682702062559e-06,
+      "loss": 0.5349,
+      "step": 3423
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3361315237513386,
+      "learning_rate": 3.938643294691302e-06,
+      "loss": 0.5383,
+      "step": 3424
+    },
+    {
+      "epoch": 0.9133333333333333,
+      "grad_norm": 0.4288977807737274,
+      "learning_rate": 3.9146755613514e-06,
+      "loss": 0.5554,
+      "step": 3425
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.37507120678643363,
+      "learning_rate": 3.890779519925825e-06,
+      "loss": 0.6013,
+      "step": 3426
+    },
+    {
+      "epoch": 0.9138666666666667,
+      "grad_norm": 0.3352797784670776,
+      "learning_rate": 3.866955188244092e-06,
+      "loss": 0.5972,
+      "step": 3427
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.36911711740443326,
+      "learning_rate": 3.843202584082161e-06,
+      "loss": 0.5588,
+      "step": 3428
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.3357896324309692,
+      "learning_rate": 3.819521725162545e-06,
+      "loss": 0.5694,
+      "step": 3429
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.33953496253329435,
+      "learning_rate": 3.7959126291541637e-06,
+      "loss": 0.5681,
+      "step": 3430
+    },
+    {
+      "epoch": 0.9149333333333334,
+      "grad_norm": 0.37423766467800623,
+      "learning_rate": 3.772375313672427e-06,
+      "loss": 0.585,
+      "step": 3431
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.32183862915826045,
+      "learning_rate": 3.7489097962791653e-06,
+      "loss": 0.5933,
+      "step": 3432
+    },
+    {
+      "epoch": 0.9154666666666667,
+      "grad_norm": 0.32908688690867044,
+      "learning_rate": 3.7255160944826617e-06,
+      "loss": 0.5889,
+      "step": 3433
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.3885315153699735,
+      "learning_rate": 3.7021942257375984e-06,
+      "loss": 0.5675,
+      "step": 3434
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3437928236652113,
+      "learning_rate": 3.6789442074450565e-06,
+      "loss": 0.5688,
+      "step": 3435
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.33830839924802564,
+      "learning_rate": 3.6557660569525253e-06,
+      "loss": 0.595,
+      "step": 3436
+    },
+    {
+      "epoch": 0.9165333333333333,
+      "grad_norm": 0.3668143129682812,
+      "learning_rate": 3.6326597915538608e-06,
+      "loss": 0.5922,
+      "step": 3437
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.3359897994039373,
+      "learning_rate": 3.6096254284892827e-06,
+      "loss": 0.5386,
+      "step": 3438
+    },
+    {
+      "epoch": 0.9170666666666667,
+      "grad_norm": 0.3452835638356792,
+      "learning_rate": 3.586662984945377e-06,
+      "loss": 0.5725,
+      "step": 3439
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.3319391224743454,
+      "learning_rate": 3.5637724780550385e-06,
+      "loss": 0.5487,
+      "step": 3440
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.3653469881855482,
+      "learning_rate": 3.5409539248975278e-06,
+      "loss": 0.5393,
+      "step": 3441
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.3591275259862968,
+      "learning_rate": 3.518207342498392e-06,
+      "loss": 0.5637,
+      "step": 3442
+    },
+    {
+      "epoch": 0.9181333333333334,
+      "grad_norm": 0.3500003388293157,
+      "learning_rate": 3.4955327478294665e-06,
+      "loss": 0.5789,
+      "step": 3443
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.35556080567542436,
+      "learning_rate": 3.472930157808907e-06,
+      "loss": 0.5455,
+      "step": 3444
+    },
+    {
+      "epoch": 0.9186666666666666,
+      "grad_norm": 0.4724456682388256,
+      "learning_rate": 3.4503995893011343e-06,
+      "loss": 0.5744,
+      "step": 3445
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.3525346173380394,
+      "learning_rate": 3.427941059116824e-06,
+      "loss": 0.5434,
+      "step": 3446
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.35242879127390236,
+      "learning_rate": 3.405554584012893e-06,
+      "loss": 0.5665,
+      "step": 3447
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.3660036809239432,
+      "learning_rate": 3.3832401806925262e-06,
+      "loss": 0.5814,
+      "step": 3448
+    },
+    {
+      "epoch": 0.9197333333333333,
+      "grad_norm": 0.32708289034734217,
+      "learning_rate": 3.3609978658051043e-06,
+      "loss": 0.5563,
+      "step": 3449
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.36244401931388587,
+      "learning_rate": 3.338827655946253e-06,
+      "loss": 0.5744,
+      "step": 3450
+    },
+    {
+      "epoch": 0.9202666666666667,
+      "grad_norm": 0.32825296256281666,
+      "learning_rate": 3.3167295676577505e-06,
+      "loss": 0.6007,
+      "step": 3451
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.3298422305151278,
+      "learning_rate": 3.294703617427608e-06,
+      "loss": 0.5179,
+      "step": 3452
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.34209999603948027,
+      "learning_rate": 3.272749821689991e-06,
+      "loss": 0.534,
+      "step": 3453
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.3387623866217589,
+      "learning_rate": 3.250868196825241e-06,
+      "loss": 0.55,
+      "step": 3454
+    },
+    {
+      "epoch": 0.9213333333333333,
+      "grad_norm": 0.35384094049418513,
+      "learning_rate": 3.22905875915982e-06,
+      "loss": 0.595,
+      "step": 3455
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.34834502979965165,
+      "learning_rate": 3.207321524966378e-06,
+      "loss": 0.5903,
+      "step": 3456
+    },
+    {
+      "epoch": 0.9218666666666666,
+      "grad_norm": 0.3466414436596609,
+      "learning_rate": 3.1856565104636415e-06,
+      "loss": 0.5848,
+      "step": 3457
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.34434766048466514,
+      "learning_rate": 3.1640637318165132e-06,
+      "loss": 0.5975,
+      "step": 3458
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.3352189395887659,
+      "learning_rate": 3.1425432051359173e-06,
+      "loss": 0.5592,
+      "step": 3459
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.33372040012220905,
+      "learning_rate": 3.121094946478942e-06,
+      "loss": 0.5199,
+      "step": 3460
+    },
+    {
+      "epoch": 0.9229333333333334,
+      "grad_norm": 0.32843513845379646,
+      "learning_rate": 3.0997189718487084e-06,
+      "loss": 0.5518,
+      "step": 3461
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.35533330705449157,
+      "learning_rate": 3.0784152971944368e-06,
+      "loss": 0.56,
+      "step": 3462
+    },
+    {
+      "epoch": 0.9234666666666667,
+      "grad_norm": 0.3474625912154599,
+      "learning_rate": 3.0571839384113786e-06,
+      "loss": 0.5805,
+      "step": 3463
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.3477743232208824,
+      "learning_rate": 3.036024911340829e-06,
+      "loss": 0.5119,
+      "step": 3464
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.3436521509737728,
+      "learning_rate": 3.0149382317701368e-06,
+      "loss": 0.5543,
+      "step": 3465
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.3658580228195742,
+      "learning_rate": 2.9939239154326613e-06,
+      "loss": 0.5663,
+      "step": 3466
+    },
+    {
+      "epoch": 0.9245333333333333,
+      "grad_norm": 0.3405350481687344,
+      "learning_rate": 2.9729819780077493e-06,
+      "loss": 0.5602,
+      "step": 3467
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3223376987274596,
+      "learning_rate": 2.9521124351207572e-06,
+      "loss": 0.5723,
+      "step": 3468
+    },
+    {
+      "epoch": 0.9250666666666667,
+      "grad_norm": 0.35046144031485543,
+      "learning_rate": 2.9313153023430407e-06,
+      "loss": 0.5565,
+      "step": 3469
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.35771389343211585,
+      "learning_rate": 2.910590595191898e-06,
+      "loss": 0.6168,
+      "step": 3470
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.3463009825761227,
+      "learning_rate": 2.8899383291306257e-06,
+      "loss": 0.568,
+      "step": 3471
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.36613794154936613,
+      "learning_rate": 2.8693585195684324e-06,
+      "loss": 0.5589,
+      "step": 3472
+    },
+    {
+      "epoch": 0.9261333333333334,
+      "grad_norm": 0.31227164196314305,
+      "learning_rate": 2.8488511818605124e-06,
+      "loss": 0.5427,
+      "step": 3473
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3359697325227988,
+      "learning_rate": 2.8284163313079146e-06,
+      "loss": 0.5235,
+      "step": 3474
+    },
+    {
+      "epoch": 0.9266666666666666,
+      "grad_norm": 0.34299822845409184,
+      "learning_rate": 2.8080539831576658e-06,
+      "loss": 0.5861,
+      "step": 3475
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.3273141045992717,
+      "learning_rate": 2.7877641526026785e-06,
+      "loss": 0.5558,
+      "step": 3476
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.3263508634726766,
+      "learning_rate": 2.767546854781744e-06,
+      "loss": 0.5396,
+      "step": 3477
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.33473440855531894,
+      "learning_rate": 2.747402104779562e-06,
+      "loss": 0.5447,
+      "step": 3478
+    },
+    {
+      "epoch": 0.9277333333333333,
+      "grad_norm": 0.342052373684546,
+      "learning_rate": 2.7273299176266863e-06,
+      "loss": 0.5933,
+      "step": 3479
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3533721872760103,
+      "learning_rate": 2.707330308299516e-06,
+      "loss": 0.5574,
+      "step": 3480
+    },
+    {
+      "epoch": 0.9282666666666667,
+      "grad_norm": 0.3549852413582081,
+      "learning_rate": 2.687403291720325e-06,
+      "loss": 0.5559,
+      "step": 3481
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.32512889753609175,
+      "learning_rate": 2.6675488827572093e-06,
+      "loss": 0.5529,
+      "step": 3482
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.34735416023315446,
+      "learning_rate": 2.647767096224063e-06,
+      "loss": 0.5616,
+      "step": 3483
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.3631690754339679,
+      "learning_rate": 2.6280579468806686e-06,
+      "loss": 0.6174,
+      "step": 3484
+    },
+    {
+      "epoch": 0.9293333333333333,
+      "grad_norm": 0.3216061865308382,
+      "learning_rate": 2.6084214494325523e-06,
+      "loss": 0.5526,
+      "step": 3485
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3464061009823028,
+      "learning_rate": 2.5888576185310267e-06,
+      "loss": 0.5491,
+      "step": 3486
+    },
+    {
+      "epoch": 0.9298666666666666,
+      "grad_norm": 0.3342680712508537,
+      "learning_rate": 2.5693664687732266e-06,
+      "loss": 0.5373,
+      "step": 3487
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.3859268952964619,
+      "learning_rate": 2.5499480147020305e-06,
+      "loss": 0.5305,
+      "step": 3488
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.34716274196260805,
+      "learning_rate": 2.530602270806104e-06,
+      "loss": 0.5974,
+      "step": 3489
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.3504451834424654,
+      "learning_rate": 2.5113292515198007e-06,
+      "loss": 0.5384,
+      "step": 3490
+    },
+    {
+      "epoch": 0.9309333333333333,
+      "grad_norm": 0.372714929701282,
+      "learning_rate": 2.4921289712232842e-06,
+      "loss": 0.5627,
+      "step": 3491
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3503620334766579,
+      "learning_rate": 2.4730014442423954e-06,
+      "loss": 0.5518,
+      "step": 3492
+    },
+    {
+      "epoch": 0.9314666666666667,
+      "grad_norm": 0.3414162857666363,
+      "learning_rate": 2.453946684848718e-06,
+      "loss": 0.5636,
+      "step": 3493
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.34387143909760715,
+      "learning_rate": 2.434964707259535e-06,
+      "loss": 0.555,
+      "step": 3494
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.3280599696909218,
+      "learning_rate": 2.416055525637828e-06,
+      "loss": 0.544,
+      "step": 3495
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.3454103759731847,
+      "learning_rate": 2.397219154092245e-06,
+      "loss": 0.5963,
+      "step": 3496
+    },
+    {
+      "epoch": 0.9325333333333333,
+      "grad_norm": 0.34073031334530307,
+      "learning_rate": 2.3784556066771544e-06,
+      "loss": 0.5648,
+      "step": 3497
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.34052111005645935,
+      "learning_rate": 2.3597648973925137e-06,
+      "loss": 0.5897,
+      "step": 3498
+    },
+    {
+      "epoch": 0.9330666666666667,
+      "grad_norm": 0.32664114860937943,
+      "learning_rate": 2.3411470401840106e-06,
+      "loss": 0.5764,
+      "step": 3499
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.32474442746561094,
+      "learning_rate": 2.3226020489429232e-06,
+      "loss": 0.5717,
+      "step": 3500
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.3249231252319353,
+      "learning_rate": 2.3041299375062053e-06,
+      "loss": 0.551,
+      "step": 3501
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.37846352234449726,
+      "learning_rate": 2.285730719656376e-06,
+      "loss": 0.562,
+      "step": 3502
+    },
+    {
+      "epoch": 0.9341333333333334,
+      "grad_norm": 0.4046561955417867,
+      "learning_rate": 2.2674044091216317e-06,
+      "loss": 0.5555,
+      "step": 3503
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.34427098431840325,
+      "learning_rate": 2.2491510195757125e-06,
+      "loss": 0.5589,
+      "step": 3504
+    },
+    {
+      "epoch": 0.9346666666666666,
+      "grad_norm": 0.3429797432890164,
+      "learning_rate": 2.230970564638002e-06,
+      "loss": 0.5307,
+      "step": 3505
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.33760232775558985,
+      "learning_rate": 2.2128630578734156e-06,
+      "loss": 0.5706,
+      "step": 3506
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.34535329393130076,
+      "learning_rate": 2.1948285127924906e-06,
+      "loss": 0.5573,
+      "step": 3507
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.3639024855554393,
+      "learning_rate": 2.1768669428512745e-06,
+      "loss": 0.6062,
+      "step": 3508
+    },
+    {
+      "epoch": 0.9357333333333333,
+      "grad_norm": 0.33958797306627986,
+      "learning_rate": 2.1589783614513912e-06,
+      "loss": 0.5891,
+      "step": 3509
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3485710628549526,
+      "learning_rate": 2.1411627819400316e-06,
+      "loss": 0.5294,
+      "step": 3510
+    },
+    {
+      "epoch": 0.9362666666666667,
+      "grad_norm": 0.3560571091250747,
+      "learning_rate": 2.123420217609862e-06,
+      "loss": 0.5774,
+      "step": 3511
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.34546401188489917,
+      "learning_rate": 2.1057506816991257e-06,
+      "loss": 0.6323,
+      "step": 3512
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.332337997169067,
+      "learning_rate": 2.0881541873915335e-06,
+      "loss": 0.5474,
+      "step": 3513
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.3486177808451853,
+      "learning_rate": 2.0706307478163157e-06,
+      "loss": 0.5797,
+      "step": 3514
+    },
+    {
+      "epoch": 0.9373333333333334,
+      "grad_norm": 0.35016015941121903,
+      "learning_rate": 2.0531803760482026e-06,
+      "loss": 0.5521,
+      "step": 3515
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.32121514868022516,
+      "learning_rate": 2.0358030851073908e-06,
+      "loss": 0.5464,
+      "step": 3516
+    },
+    {
+      "epoch": 0.9378666666666666,
+      "grad_norm": 0.31798881260197637,
+      "learning_rate": 2.0184988879595635e-06,
+      "loss": 0.5308,
+      "step": 3517
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.3293406600497602,
+      "learning_rate": 2.0012677975158488e-06,
+      "loss": 0.5381,
+      "step": 3518
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.3176534896163629,
+      "learning_rate": 1.984109826632863e-06,
+      "loss": 0.5321,
+      "step": 3519
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.3311629912277586,
+      "learning_rate": 1.9670249881126202e-06,
+      "loss": 0.5084,
+      "step": 3520
+    },
+    {
+      "epoch": 0.9389333333333333,
+      "grad_norm": 0.3470416917579143,
+      "learning_rate": 1.9500132947026017e-06,
+      "loss": 0.5165,
+      "step": 3521
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3631737789780499,
+      "learning_rate": 1.933074759095688e-06,
+      "loss": 0.6042,
+      "step": 3522
+    },
+    {
+      "epoch": 0.9394666666666667,
+      "grad_norm": 0.3613474421439833,
+      "learning_rate": 1.916209393930202e-06,
+      "loss": 0.601,
+      "step": 3523
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.3202828824288851,
+      "learning_rate": 1.8994172117898557e-06,
+      "loss": 0.5336,
+      "step": 3524
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3630967793339104,
+      "learning_rate": 1.8826982252037606e-06,
+      "loss": 0.6005,
+      "step": 3525
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.3467614977155515,
+      "learning_rate": 1.8660524466464158e-06,
+      "loss": 0.559,
+      "step": 3526
+    },
+    {
+      "epoch": 0.9405333333333333,
+      "grad_norm": 0.31526517066045545,
+      "learning_rate": 1.8494798885376863e-06,
+      "loss": 0.572,
+      "step": 3527
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.32889037817569244,
+      "learning_rate": 1.8329805632428255e-06,
+      "loss": 0.5506,
+      "step": 3528
+    },
+    {
+      "epoch": 0.9410666666666667,
+      "grad_norm": 0.3332969614301865,
+      "learning_rate": 1.81655448307243e-06,
+      "loss": 0.5272,
+      "step": 3529
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.3361715357953492,
+      "learning_rate": 1.8002016602824635e-06,
+      "loss": 0.5445,
+      "step": 3530
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.4313126658557049,
+      "learning_rate": 1.7839221070741984e-06,
+      "loss": 0.561,
+      "step": 3531
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.3418637138104556,
+      "learning_rate": 1.7677158355942635e-06,
+      "loss": 0.5436,
+      "step": 3532
+    },
+    {
+      "epoch": 0.9421333333333334,
+      "grad_norm": 0.3355872413959369,
+      "learning_rate": 1.7515828579346194e-06,
+      "loss": 0.5624,
+      "step": 3533
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3245741371298666,
+      "learning_rate": 1.7355231861325261e-06,
+      "loss": 0.5463,
+      "step": 3534
+    },
+    {
+      "epoch": 0.9426666666666667,
+      "grad_norm": 0.3290236800807073,
+      "learning_rate": 1.7195368321705319e-06,
+      "loss": 0.5435,
+      "step": 3535
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.3513771202722227,
+      "learning_rate": 1.7036238079765178e-06,
+      "loss": 0.5709,
+      "step": 3536
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.3497099468242933,
+      "learning_rate": 1.6877841254236082e-06,
+      "loss": 0.5571,
+      "step": 3537
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.35291069476649944,
+      "learning_rate": 1.6720177963302497e-06,
+      "loss": 0.5829,
+      "step": 3538
+    },
+    {
+      "epoch": 0.9437333333333333,
+      "grad_norm": 0.3223102149956902,
+      "learning_rate": 1.6563248324600988e-06,
+      "loss": 0.5155,
+      "step": 3539
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3335795074723978,
+      "learning_rate": 1.640705245522156e-06,
+      "loss": 0.5325,
+      "step": 3540
+    },
+    {
+      "epoch": 0.9442666666666667,
+      "grad_norm": 0.33954714328334745,
+      "learning_rate": 1.6251590471705991e-06,
+      "loss": 0.5601,
+      "step": 3541
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.3448281605435717,
+      "learning_rate": 1.6096862490048935e-06,
+      "loss": 0.5373,
+      "step": 3542
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.3274949838020495,
+      "learning_rate": 1.594286862569694e-06,
+      "loss": 0.554,
+      "step": 3543
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.3307476735958091,
+      "learning_rate": 1.5789608993549421e-06,
+      "loss": 0.5821,
+      "step": 3544
+    },
+    {
+      "epoch": 0.9453333333333334,
+      "grad_norm": 0.3378389240492942,
+      "learning_rate": 1.5637083707957356e-06,
+      "loss": 0.5666,
+      "step": 3545
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3563248431366006,
+      "learning_rate": 1.5485292882724156e-06,
+      "loss": 0.5548,
+      "step": 3546
+    },
+    {
+      "epoch": 0.9458666666666666,
+      "grad_norm": 0.32814676407780136,
+      "learning_rate": 1.5334236631105225e-06,
+      "loss": 0.5759,
+      "step": 3547
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.32902095074114185,
+      "learning_rate": 1.518391506580763e-06,
+      "loss": 0.5635,
+      "step": 3548
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.3405503610898305,
+      "learning_rate": 1.5034328298990652e-06,
+      "loss": 0.5489,
+      "step": 3549
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.3600760563968869,
+      "learning_rate": 1.48854764422649e-06,
+      "loss": 0.5474,
+      "step": 3550
+    },
+    {
+      "epoch": 0.9469333333333333,
+      "grad_norm": 0.34742946224123245,
+      "learning_rate": 1.473735960669309e-06,
+      "loss": 0.5635,
+      "step": 3551
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.33209614763239237,
+      "learning_rate": 1.4589977902789042e-06,
+      "loss": 0.5433,
+      "step": 3552
+    },
+    {
+      "epoch": 0.9474666666666667,
+      "grad_norm": 0.3457489799683696,
+      "learning_rate": 1.4443331440518459e-06,
+      "loss": 0.6113,
+      "step": 3553
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.3835404413535953,
+      "learning_rate": 1.4297420329298372e-06,
+      "loss": 0.6201,
+      "step": 3554
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.3641691198464094,
+      "learning_rate": 1.4152244677996918e-06,
+      "loss": 0.571,
+      "step": 3555
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.3322347513336434,
+      "learning_rate": 1.4007804594933672e-06,
+      "loss": 0.5121,
+      "step": 3556
+    },
+    {
+      "epoch": 0.9485333333333333,
+      "grad_norm": 0.3321959483135998,
+      "learning_rate": 1.3864100187879536e-06,
+      "loss": 0.5651,
+      "step": 3557
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.32269460844323217,
+      "learning_rate": 1.37211315640563e-06,
+      "loss": 0.5429,
+      "step": 3558
+    },
+    {
+      "epoch": 0.9490666666666666,
+      "grad_norm": 0.34910224704439924,
+      "learning_rate": 1.3578898830136633e-06,
+      "loss": 0.5747,
+      "step": 3559
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.3486414831020094,
+      "learning_rate": 1.3437402092244534e-06,
+      "loss": 0.5945,
+      "step": 3560
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.4003486975347935,
+      "learning_rate": 1.3296641455954438e-06,
+      "loss": 0.5657,
+      "step": 3561
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.3603519837752374,
+      "learning_rate": 1.3156617026291783e-06,
+      "loss": 0.5804,
+      "step": 3562
+    },
+    {
+      "epoch": 0.9501333333333334,
+      "grad_norm": 0.3484500566141622,
+      "learning_rate": 1.3017328907732774e-06,
+      "loss": 0.6287,
+      "step": 3563
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3534093927288817,
+      "learning_rate": 1.2878777204204052e-06,
+      "loss": 0.5885,
+      "step": 3564
+    },
+    {
+      "epoch": 0.9506666666666667,
+      "grad_norm": 0.3324863923939078,
+      "learning_rate": 1.2740962019082814e-06,
+      "loss": 0.5441,
+      "step": 3565
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.34486810729696377,
+      "learning_rate": 1.2603883455196918e-06,
+      "loss": 0.5297,
+      "step": 3566
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.35007800836673886,
+      "learning_rate": 1.246754161482433e-06,
+      "loss": 0.604,
+      "step": 3567
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.3568450938297453,
+      "learning_rate": 1.233193659969356e-06,
+      "loss": 0.557,
+      "step": 3568
+    },
+    {
+      "epoch": 0.9517333333333333,
+      "grad_norm": 0.35780339545413115,
+      "learning_rate": 1.2197068510983123e-06,
+      "loss": 0.5756,
+      "step": 3569
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.34446415802344327,
+      "learning_rate": 1.2062937449321853e-06,
+      "loss": 0.5437,
+      "step": 3570
+    },
+    {
+      "epoch": 0.9522666666666667,
+      "grad_norm": 0.34101359780530865,
+      "learning_rate": 1.192954351478881e-06,
+      "loss": 0.6003,
+      "step": 3571
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.3516004308518511,
+      "learning_rate": 1.1796886806912711e-06,
+      "loss": 0.5662,
+      "step": 3572
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.33560129676716577,
+      "learning_rate": 1.166496742467249e-06,
+      "loss": 0.6255,
+      "step": 3573
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.34565720758360824,
+      "learning_rate": 1.153378546649686e-06,
+      "loss": 0.5983,
+      "step": 3574
+    },
+    {
+      "epoch": 0.9533333333333334,
+      "grad_norm": 0.35761641574607006,
+      "learning_rate": 1.1403341030264192e-06,
+      "loss": 0.5413,
+      "step": 3575
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.33527998229172135,
+      "learning_rate": 1.1273634213302742e-06,
+      "loss": 0.5339,
+      "step": 3576
+    },
+    {
+      "epoch": 0.9538666666666666,
+      "grad_norm": 0.34193888567715813,
+      "learning_rate": 1.1144665112390317e-06,
+      "loss": 0.5899,
+      "step": 3577
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.3377647101175997,
+      "learning_rate": 1.101643382375439e-06,
+      "loss": 0.5485,
+      "step": 3578
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.34807809101284953,
+      "learning_rate": 1.088894044307176e-06,
+      "loss": 0.531,
+      "step": 3579
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.3483662790485872,
+      "learning_rate": 1.076218506546889e-06,
+      "loss": 0.603,
+      "step": 3580
+    },
+    {
+      "epoch": 0.9549333333333333,
+      "grad_norm": 0.3388854005197344,
+      "learning_rate": 1.0636167785521456e-06,
+      "loss": 0.5688,
+      "step": 3581
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.35006377953284007,
+      "learning_rate": 1.0510888697254362e-06,
+      "loss": 0.5441,
+      "step": 3582
+    },
+    {
+      "epoch": 0.9554666666666667,
+      "grad_norm": 0.35650236259787543,
+      "learning_rate": 1.0386347894141834e-06,
+      "loss": 0.56,
+      "step": 3583
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.35339656611204234,
+      "learning_rate": 1.026254546910721e-06,
+      "loss": 0.5895,
+      "step": 3584
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.35522206248276794,
+      "learning_rate": 1.013948151452282e-06,
+      "loss": 0.5541,
+      "step": 3585
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.45822286253643324,
+      "learning_rate": 1.0017156122210103e-06,
+      "loss": 0.5335,
+      "step": 3586
+    },
+    {
+      "epoch": 0.9565333333333333,
+      "grad_norm": 0.36208808271064635,
+      "learning_rate": 9.895569383439497e-07,
+      "loss": 0.5345,
+      "step": 3587
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.35193403727049133,
+      "learning_rate": 9.774721388930208e-07,
+      "loss": 0.5733,
+      "step": 3588
+    },
+    {
+      "epoch": 0.9570666666666666,
+      "grad_norm": 0.3439579316664456,
+      "learning_rate": 9.654612228850112e-07,
+      "loss": 0.6056,
+      "step": 3589
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.33881712346131376,
+      "learning_rate": 9.53524199281619e-07,
+      "loss": 0.5533,
+      "step": 3590
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.33976941180649134,
+      "learning_rate": 9.416610769893863e-07,
+      "loss": 0.5525,
+      "step": 3591
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.32874693623032547,
+      "learning_rate": 9.298718648596882e-07,
+      "loss": 0.5839,
+      "step": 3592
+    },
+    {
+      "epoch": 0.9581333333333333,
+      "grad_norm": 0.3587153259638571,
+      "learning_rate": 9.181565716888108e-07,
+      "loss": 0.5308,
+      "step": 3593
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.34000172209033314,
+      "learning_rate": 9.065152062178394e-07,
+      "loss": 0.5948,
+      "step": 3594
+    },
+    {
+      "epoch": 0.9586666666666667,
+      "grad_norm": 0.332819201446939,
+      "learning_rate": 8.949477771327375e-07,
+      "loss": 0.5709,
+      "step": 3595
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.36653112439664776,
+      "learning_rate": 8.834542930642564e-07,
+      "loss": 0.5342,
+      "step": 3596
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.35649140485076625,
+      "learning_rate": 8.720347625880365e-07,
+      "loss": 0.5857,
+      "step": 3597
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.3382152692497552,
+      "learning_rate": 8.606891942244843e-07,
+      "loss": 0.542,
+      "step": 3598
+    },
+    {
+      "epoch": 0.9597333333333333,
+      "grad_norm": 0.3592859145050308,
+      "learning_rate": 8.494175964388285e-07,
+      "loss": 0.565,
+      "step": 3599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3347940304545129,
+      "learning_rate": 8.382199776411526e-07,
+      "loss": 0.5408,
+      "step": 3600
+    },
+    {
+      "epoch": 0.9602666666666667,
+      "grad_norm": 0.3432807253611525,
+      "learning_rate": 8.270963461862735e-07,
+      "loss": 0.5235,
+      "step": 3601
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.3292055268567841,
+      "learning_rate": 8.160467103738744e-07,
+      "loss": 0.5691,
+      "step": 3602
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.37226392552353654,
+      "learning_rate": 8.050710784483606e-07,
+      "loss": 0.5794,
+      "step": 3603
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.3502730790953156,
+      "learning_rate": 7.941694585989812e-07,
+      "loss": 0.5478,
+      "step": 3604
+    },
+    {
+      "epoch": 0.9613333333333334,
+      "grad_norm": 0.3510793166853264,
+      "learning_rate": 7.833418589597297e-07,
+      "loss": 0.5409,
+      "step": 3605
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.35033417150917406,
+      "learning_rate": 7.72588287609366e-07,
+      "loss": 0.5365,
+      "step": 3606
+    },
+    {
+      "epoch": 0.9618666666666666,
+      "grad_norm": 0.36602968817644155,
+      "learning_rate": 7.619087525714385e-07,
+      "loss": 0.5996,
+      "step": 3607
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.34394524029132817,
+      "learning_rate": 7.51303261814229e-07,
+      "loss": 0.58,
+      "step": 3608
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.3646799709785778,
+      "learning_rate": 7.407718232508077e-07,
+      "loss": 0.6156,
+      "step": 3609
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.344240762525615,
+      "learning_rate": 7.303144447389554e-07,
+      "loss": 0.5627,
+      "step": 3610
+    },
+    {
+      "epoch": 0.9629333333333333,
+      "grad_norm": 0.33817371267390517,
+      "learning_rate": 7.199311340812087e-07,
+      "loss": 0.52,
+      "step": 3611
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3405187485431618,
+      "learning_rate": 7.096218990248593e-07,
+      "loss": 0.5541,
+      "step": 3612
+    },
+    {
+      "epoch": 0.9634666666666667,
+      "grad_norm": 0.33714832689249224,
+      "learning_rate": 6.993867472618987e-07,
+      "loss": 0.5505,
+      "step": 3613
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.36486171535355555,
+      "learning_rate": 6.892256864290625e-07,
+      "loss": 0.5901,
+      "step": 3614
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.3504758719446265,
+      "learning_rate": 6.791387241077973e-07,
+      "loss": 0.5875,
+      "step": 3615
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.34338427679710004,
+      "learning_rate": 6.691258678242607e-07,
+      "loss": 0.5188,
+      "step": 3616
+    },
+    {
+      "epoch": 0.9645333333333334,
+      "grad_norm": 0.3550936875702556,
+      "learning_rate": 6.591871250493209e-07,
+      "loss": 0.5492,
+      "step": 3617
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3607325230619524,
+      "learning_rate": 6.493225031985573e-07,
+      "loss": 0.6269,
+      "step": 3618
+    },
+    {
+      "epoch": 0.9650666666666666,
+      "grad_norm": 0.3477495884654401,
+      "learning_rate": 6.395320096322266e-07,
+      "loss": 0.5934,
+      "step": 3619
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.3547980526668001,
+      "learning_rate": 6.298156516552967e-07,
+      "loss": 0.5718,
+      "step": 3620
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.34244665874709795,
+      "learning_rate": 6.201734365174017e-07,
+      "loss": 0.5463,
+      "step": 3621
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.34720102470117736,
+      "learning_rate": 6.106053714128757e-07,
+      "loss": 0.5417,
+      "step": 3622
+    },
+    {
+      "epoch": 0.9661333333333333,
+      "grad_norm": 0.340002079828364,
+      "learning_rate": 6.011114634807081e-07,
+      "loss": 0.5143,
+      "step": 3623
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.34122659002576333,
+      "learning_rate": 5.916917198045546e-07,
+      "loss": 0.5506,
+      "step": 3624
+    },
+    {
+      "epoch": 0.9666666666666667,
+      "grad_norm": 0.3595142029953279,
+      "learning_rate": 5.8234614741276e-07,
+      "loss": 0.6081,
+      "step": 3625
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.3379322322573059,
+      "learning_rate": 5.730747532783243e-07,
+      "loss": 0.5523,
+      "step": 3626
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.3296154389395013,
+      "learning_rate": 5.638775443188693e-07,
+      "loss": 0.5483,
+      "step": 3627
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.32713820569558444,
+      "learning_rate": 5.547545273966947e-07,
+      "loss": 0.5632,
+      "step": 3628
+    },
+    {
+      "epoch": 0.9677333333333333,
+      "grad_norm": 0.3482877643571333,
+      "learning_rate": 5.457057093187334e-07,
+      "loss": 0.5874,
+      "step": 3629
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3478564773327796,
+      "learning_rate": 5.367310968365624e-07,
+      "loss": 0.5422,
+      "step": 3630
+    },
+    {
+      "epoch": 0.9682666666666667,
+      "grad_norm": 0.33276209947772445,
+      "learning_rate": 5.278306966463919e-07,
+      "loss": 0.5686,
+      "step": 3631
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.33313289588087,
+      "learning_rate": 5.190045153890433e-07,
+      "loss": 0.5761,
+      "step": 3632
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.35060308437282434,
+      "learning_rate": 5.102525596499929e-07,
+      "loss": 0.6527,
+      "step": 3633
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.36335554292409006,
+      "learning_rate": 5.015748359592953e-07,
+      "loss": 0.6145,
+      "step": 3634
+    },
+    {
+      "epoch": 0.9693333333333334,
+      "grad_norm": 0.33928078851547144,
+      "learning_rate": 4.9297135079166e-07,
+      "loss": 0.6172,
+      "step": 3635
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.38826307780126884,
+      "learning_rate": 4.844421105663743e-07,
+      "loss": 0.5942,
+      "step": 3636
+    },
+    {
+      "epoch": 0.9698666666666667,
+      "grad_norm": 0.3525869725339327,
+      "learning_rate": 4.759871216473366e-07,
+      "loss": 0.552,
+      "step": 3637
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.34299810530404984,
+      "learning_rate": 4.676063903430561e-07,
+      "loss": 0.5806,
+      "step": 3638
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.3267379743822501,
+      "learning_rate": 4.5929992290661973e-07,
+      "loss": 0.5448,
+      "step": 3639
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.34606923723750227,
+      "learning_rate": 4.510677255357143e-07,
+      "loss": 0.5355,
+      "step": 3640
+    },
+    {
+      "epoch": 0.9709333333333333,
+      "grad_norm": 0.5412897534496282,
+      "learning_rate": 4.429098043726154e-07,
+      "loss": 0.5515,
+      "step": 3641
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3628427434950643,
+      "learning_rate": 4.3482616550416523e-07,
+      "loss": 0.5952,
+      "step": 3642
+    },
+    {
+      "epoch": 0.9714666666666667,
+      "grad_norm": 0.6060764455488781,
+      "learning_rate": 4.2681681496179457e-07,
+      "loss": 0.542,
+      "step": 3643
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.3617796677527885,
+      "learning_rate": 4.188817587215121e-07,
+      "loss": 0.545,
+      "step": 3644
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.34276116956772457,
+      "learning_rate": 4.110210027038597e-07,
+      "loss": 0.5529,
+      "step": 3645
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.3453367869788142,
+      "learning_rate": 4.0323455277397894e-07,
+      "loss": 0.5487,
+      "step": 3646
+    },
+    {
+      "epoch": 0.9725333333333334,
+      "grad_norm": 0.3601716163110895,
+      "learning_rate": 3.955224147415559e-07,
+      "loss": 0.5822,
+      "step": 3647
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.34331308511554115,
+      "learning_rate": 3.8788459436082115e-07,
+      "loss": 0.5282,
+      "step": 3648
+    },
+    {
+      "epoch": 0.9730666666666666,
+      "grad_norm": 0.3657599969727808,
+      "learning_rate": 3.803210973305715e-07,
+      "loss": 0.5712,
+      "step": 3649
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.34918736921749277,
+      "learning_rate": 3.7283192929412626e-07,
+      "loss": 0.577,
+      "step": 3650
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.3415858611957605,
+      "learning_rate": 3.654170958393821e-07,
+      "loss": 0.608,
+      "step": 3651
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.3464244819210767,
+      "learning_rate": 3.5807660249873585e-07,
+      "loss": 0.5926,
+      "step": 3652
+    },
+    {
+      "epoch": 0.9741333333333333,
+      "grad_norm": 0.35190125691584373,
+      "learning_rate": 3.508104547491509e-07,
+      "loss": 0.5972,
+      "step": 3653
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3345522118761501,
+      "learning_rate": 3.436186580120793e-07,
+      "loss": 0.5646,
+      "step": 3654
+    },
+    {
+      "epoch": 0.9746666666666667,
+      "grad_norm": 0.33590588885090034,
+      "learning_rate": 3.365012176535287e-07,
+      "loss": 0.5649,
+      "step": 3655
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.34206677560213433,
+      "learning_rate": 3.294581389840068e-07,
+      "loss": 0.5486,
+      "step": 3656
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.3459035480155328,
+      "learning_rate": 3.2248942725856545e-07,
+      "loss": 0.5584,
+      "step": 3657
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.35073761499048756,
+      "learning_rate": 3.155950876767455e-07,
+      "loss": 0.5758,
+      "step": 3658
+    },
+    {
+      "epoch": 0.9757333333333333,
+      "grad_norm": 0.32697923909304105,
+      "learning_rate": 3.087751253826099e-07,
+      "loss": 0.5099,
+      "step": 3659
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3504979255702691,
+      "learning_rate": 3.020295454647104e-07,
+      "loss": 0.5235,
+      "step": 3660
+    },
+    {
+      "epoch": 0.9762666666666666,
+      "grad_norm": 0.3601831993531229,
+      "learning_rate": 2.95358352956121e-07,
+      "loss": 0.5357,
+      "step": 3661
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.35176994725296307,
+      "learning_rate": 2.8876155283440455e-07,
+      "loss": 0.5917,
+      "step": 3662
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.33656028844879343,
+      "learning_rate": 2.822391500215904e-07,
+      "loss": 0.5416,
+      "step": 3663
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.33395575999512844,
+      "learning_rate": 2.757911493842524e-07,
+      "loss": 0.5674,
+      "step": 3664
+    },
+    {
+      "epoch": 0.9773333333333334,
+      "grad_norm": 0.3551203866045012,
+      "learning_rate": 2.694175557334089e-07,
+      "loss": 0.5939,
+      "step": 3665
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3379185779211055,
+      "learning_rate": 2.631183738245779e-07,
+      "loss": 0.5867,
+      "step": 3666
+    },
+    {
+      "epoch": 0.9778666666666667,
+      "grad_norm": 0.3464942105968554,
+      "learning_rate": 2.5689360835775557e-07,
+      "loss": 0.571,
+      "step": 3667
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.32517939912996274,
+      "learning_rate": 2.5074326397740435e-07,
+      "loss": 0.6034,
+      "step": 3668
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.3398040745586794,
+      "learning_rate": 2.446673452724646e-07,
+      "loss": 0.564,
+      "step": 3669
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.3491270842941653,
+      "learning_rate": 2.3866585677635446e-07,
+      "loss": 0.558,
+      "step": 3670
+    },
+    {
+      "epoch": 0.9789333333333333,
+      "grad_norm": 0.346518542501218,
+      "learning_rate": 2.327388029669586e-07,
+      "loss": 0.5784,
+      "step": 3671
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.33509570506283576,
+      "learning_rate": 2.2688618826659513e-07,
+      "loss": 0.5618,
+      "step": 3672
+    },
+    {
+      "epoch": 0.9794666666666667,
+      "grad_norm": 0.34748912641619945,
+      "learning_rate": 2.2110801704207097e-07,
+      "loss": 0.5719,
+      "step": 3673
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.35738002745102626,
+      "learning_rate": 2.1540429360463744e-07,
+      "loss": 0.5603,
+      "step": 3674
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.3400461056379536,
+      "learning_rate": 2.0977502221000145e-07,
+      "loss": 0.575,
+      "step": 3675
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.337193682785471,
+      "learning_rate": 2.0422020705832544e-07,
+      "loss": 0.5735,
+      "step": 3676
+    },
+    {
+      "epoch": 0.9805333333333334,
+      "grad_norm": 0.35670374797821,
+      "learning_rate": 1.9873985229419411e-07,
+      "loss": 0.5511,
+      "step": 3677
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.36498213833039267,
+      "learning_rate": 1.9333396200666988e-07,
+      "loss": 0.5674,
+      "step": 3678
+    },
+    {
+      "epoch": 0.9810666666666666,
+      "grad_norm": 0.34443444689075975,
+      "learning_rate": 1.8800254022922624e-07,
+      "loss": 0.5369,
+      "step": 3679
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.3397360982084764,
+      "learning_rate": 1.8274559093978126e-07,
+      "loss": 0.5746,
+      "step": 3680
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.3361284184682215,
+      "learning_rate": 1.7756311806069737e-07,
+      "loss": 0.5549,
+      "step": 3681
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.4169688985782199,
+      "learning_rate": 1.724551254587481e-07,
+      "loss": 0.549,
+      "step": 3682
+    },
+    {
+      "epoch": 0.9821333333333333,
+      "grad_norm": 0.34284336851101704,
+      "learning_rate": 1.6742161694516257e-07,
+      "loss": 0.5828,
+      "step": 3683
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.36003515900908467,
+      "learning_rate": 1.62462596275581e-07,
+      "loss": 0.5616,
+      "step": 3684
+    },
+    {
+      "epoch": 0.9826666666666667,
+      "grad_norm": 0.34646733660146567,
+      "learning_rate": 1.5757806715005487e-07,
+      "loss": 0.5276,
+      "step": 3685
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.33333869172869685,
+      "learning_rate": 1.5276803321307987e-07,
+      "loss": 0.5425,
+      "step": 3686
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.3305183354420717,
+      "learning_rate": 1.480324980535408e-07,
+      "loss": 0.5382,
+      "step": 3687
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.32742540141707405,
+      "learning_rate": 1.4337146520475575e-07,
+      "loss": 0.5509,
+      "step": 3688
+    },
+    {
+      "epoch": 0.9837333333333333,
+      "grad_norm": 0.3715852882148017,
+      "learning_rate": 1.3878493814445392e-07,
+      "loss": 0.5766,
+      "step": 3689
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.43519776841736296,
+      "learning_rate": 1.3427292029476458e-07,
+      "loss": 0.553,
+      "step": 3690
+    },
+    {
+      "epoch": 0.9842666666666666,
+      "grad_norm": 0.3547770909363482,
+      "learning_rate": 1.2983541502222807e-07,
+      "loss": 0.5794,
+      "step": 3691
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.35048216490954837,
+      "learning_rate": 1.2547242563780703e-07,
+      "loss": 0.5625,
+      "step": 3692
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.33134426730204997,
+      "learning_rate": 1.211839553968197e-07,
+      "loss": 0.5413,
+      "step": 3693
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.3427775147266325,
+      "learning_rate": 1.1697000749903986e-07,
+      "loss": 0.5914,
+      "step": 3694
+    },
+    {
+      "epoch": 0.9853333333333333,
+      "grad_norm": 0.36056732951662795,
+      "learning_rate": 1.1283058508858579e-07,
+      "loss": 0.5827,
+      "step": 3695
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3569851450856535,
+      "learning_rate": 1.0876569125400915e-07,
+      "loss": 0.5374,
+      "step": 3696
+    },
+    {
+      "epoch": 0.9858666666666667,
+      "grad_norm": 0.363256035234427,
+      "learning_rate": 1.0477532902823939e-07,
+      "loss": 0.5724,
+      "step": 3697
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.33085860463381517,
+      "learning_rate": 1.008595013885949e-07,
+      "loss": 0.5255,
+      "step": 3698
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.33548418337483943,
+      "learning_rate": 9.701821125678301e-08,
+      "loss": 0.564,
+      "step": 3699
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.3503141815481428,
+      "learning_rate": 9.325146149888886e-08,
+      "loss": 0.5855,
+      "step": 3700
+    },
+    {
+      "epoch": 0.9869333333333333,
+      "grad_norm": 0.33196088931707374,
+      "learning_rate": 8.955925492539763e-08,
+      "loss": 0.5484,
+      "step": 3701
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.352459366831373,
+      "learning_rate": 8.594159429117233e-08,
+      "loss": 0.5872,
+      "step": 3702
+    },
+    {
+      "epoch": 0.9874666666666667,
+      "grad_norm": 0.37196689024942703,
+      "learning_rate": 8.239848229543156e-08,
+      "loss": 0.5822,
+      "step": 3703
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.33349634027677055,
+      "learning_rate": 7.892992158179401e-08,
+      "loss": 0.5348,
+      "step": 3704
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.36103265082571734,
+      "learning_rate": 7.553591473825617e-08,
+      "loss": 0.5741,
+      "step": 3705
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.3456989224033128,
+      "learning_rate": 7.221646429718121e-08,
+      "loss": 0.5438,
+      "step": 3706
+    },
+    {
+      "epoch": 0.9885333333333334,
+      "grad_norm": 0.3572267712364149,
+      "learning_rate": 6.897157273528798e-08,
+      "loss": 0.6035,
+      "step": 3707
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.32692865411323563,
+      "learning_rate": 6.580124247370644e-08,
+      "loss": 0.5382,
+      "step": 3708
+    },
+    {
+      "epoch": 0.9890666666666666,
+      "grad_norm": 0.3572657522968857,
+      "learning_rate": 6.270547587787778e-08,
+      "loss": 0.5513,
+      "step": 3709
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.33975055889845546,
+      "learning_rate": 5.968427525765429e-08,
+      "loss": 0.5614,
+      "step": 3710
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.35863665738117934,
+      "learning_rate": 5.673764286724392e-08,
+      "loss": 0.5273,
+      "step": 3711
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.36455552879504405,
+      "learning_rate": 5.3865580905188005e-08,
+      "loss": 0.6047,
+      "step": 3712
+    },
+    {
+      "epoch": 0.9901333333333333,
+      "grad_norm": 0.3439122183945789,
+      "learning_rate": 5.106809151443903e-08,
+      "loss": 0.5605,
+      "step": 3713
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.33039237994533466,
+      "learning_rate": 4.834517678226069e-08,
+      "loss": 0.5595,
+      "step": 3714
+    },
+    {
+      "epoch": 0.9906666666666667,
+      "grad_norm": 0.3207122880754927,
+      "learning_rate": 4.569683874029451e-08,
+      "loss": 0.5578,
+      "step": 3715
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.3372882212816138,
+      "learning_rate": 4.3123079364559834e-08,
+      "loss": 0.5269,
+      "step": 3716
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.33359267062609543,
+      "learning_rate": 4.062390057538723e-08,
+      "loss": 0.5489,
+      "step": 3717
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.375741265348412,
+      "learning_rate": 3.819930423749618e-08,
+      "loss": 0.5574,
+      "step": 3718
+    },
+    {
+      "epoch": 0.9917333333333334,
+      "grad_norm": 0.3383230058068102,
+      "learning_rate": 3.5849292159928495e-08,
+      "loss": 0.5764,
+      "step": 3719
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3599320619740979,
+      "learning_rate": 3.3573866096114903e-08,
+      "loss": 0.6167,
+      "step": 3720
+    },
+    {
+      "epoch": 0.9922666666666666,
+      "grad_norm": 0.3268065053039836,
+      "learning_rate": 3.137302774379736e-08,
+      "loss": 0.5484,
+      "step": 3721
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.332945731946259,
+      "learning_rate": 2.9246778745095628e-08,
+      "loss": 0.556,
+      "step": 3722
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.3592303437059356,
+      "learning_rate": 2.7195120686451804e-08,
+      "loss": 0.5138,
+      "step": 3723
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.4313976678293077,
+      "learning_rate": 2.5218055098663594e-08,
+      "loss": 0.5876,
+      "step": 3724
+    },
+    {
+      "epoch": 0.9933333333333333,
+      "grad_norm": 0.3447874235855294,
+      "learning_rate": 2.331558345688434e-08,
+      "loss": 0.5888,
+      "step": 3725
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3290418583840294,
+      "learning_rate": 2.1487707180589677e-08,
+      "loss": 0.525,
+      "step": 3726
+    },
+    {
+      "epoch": 0.9938666666666667,
+      "grad_norm": 0.3790787818656205,
+      "learning_rate": 1.9734427633621987e-08,
+      "loss": 0.5406,
+      "step": 3727
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.33915403439916647,
+      "learning_rate": 1.8055746124134854e-08,
+      "loss": 0.5448,
+      "step": 3728
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.3571290661160332,
+      "learning_rate": 1.6451663904648584e-08,
+      "loss": 0.5222,
+      "step": 3729
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.3424379993643115,
+      "learning_rate": 1.4922182172016908e-08,
+      "loss": 0.5817,
+      "step": 3730
+    },
+    {
+      "epoch": 0.9949333333333333,
+      "grad_norm": 0.3461349164703394,
+      "learning_rate": 1.3467302067426969e-08,
+      "loss": 0.5614,
+      "step": 3731
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.35538227732488453,
+      "learning_rate": 1.2087024676388226e-08,
+      "loss": 0.5674,
+      "step": 3732
+    },
+    {
+      "epoch": 0.9954666666666667,
+      "grad_norm": 0.3822033804860934,
+      "learning_rate": 1.0781351028787967e-08,
+      "loss": 0.5911,
+      "step": 3733
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.32801843446391854,
+      "learning_rate": 9.55028209881359e-09,
+      "loss": 0.5777,
+      "step": 3734
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.34182533513928925,
+      "learning_rate": 8.393818804997012e-09,
+      "loss": 0.575,
+      "step": 3735
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.3425173015154349,
+      "learning_rate": 7.311962010214668e-09,
+      "loss": 0.5494,
+      "step": 3736
+    },
+    {
+      "epoch": 0.9965333333333334,
+      "grad_norm": 0.33321485466971074,
+      "learning_rate": 6.304712521665312e-09,
+      "loss": 0.5913,
+      "step": 3737
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.48496049359812166,
+      "learning_rate": 5.372071090892217e-09,
+      "loss": 0.582,
+      "step": 3738
+    },
+    {
+      "epoch": 0.9970666666666667,
+      "grad_norm": 0.3419797651284938,
+      "learning_rate": 4.514038413749866e-09,
+      "loss": 0.5868,
+      "step": 3739
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.3450179455877578,
+      "learning_rate": 3.730615130448367e-09,
+      "loss": 0.5628,
+      "step": 3740
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.3369608810674822,
+      "learning_rate": 3.0218018255312454e-09,
+      "loss": 0.5876,
+      "step": 3741
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.3213782094523305,
+      "learning_rate": 2.387599027853238e-09,
+      "loss": 0.536,
+      "step": 3742
+    },
+    {
+      "epoch": 0.9981333333333333,
+      "grad_norm": 0.3373379710366741,
+      "learning_rate": 1.8280072106025003e-09,
+      "loss": 0.5671,
+      "step": 3743
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.347081184033703,
+      "learning_rate": 1.3430267913228101e-09,
+      "loss": 0.6081,
+      "step": 3744
+    },
+    {
+      "epoch": 0.9986666666666667,
+      "grad_norm": 0.37614554963750674,
+      "learning_rate": 9.326581318691574e-10,
+      "loss": 0.6076,
+      "step": 3745
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.3391325586916799,
+      "learning_rate": 5.969015384188481e-10,
+      "loss": 0.5597,
+      "step": 3746
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.33087837723988733,
+      "learning_rate": 3.357572614937077e-10,
+      "loss": 0.5337,
+      "step": 3747
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.37229213635787656,
+      "learning_rate": 1.4922549594897916e-10,
+      "loss": 0.6559,
+      "step": 3748
+    },
+    {
+      "epoch": 0.9997333333333334,
+      "grad_norm": 0.3555346325789637,
+      "learning_rate": 3.7306380940016484e-11,
+      "loss": 0.5591,
+      "step": 3749
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3498437473308086,
+      "learning_rate": 0.0,
+      "loss": 0.6484,
+      "step": 3750
+    },
+    {
+      "epoch": 1.0,
+      "step": 3750,
+      "total_flos": 3338724519444480.0,
+      "train_loss": 0.6330405312856039,
+      "train_runtime": 59473.6136,
+      "train_samples_per_second": 1.009,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 3750,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3338724519444480.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a434797959fc324c5714ca1d99639fb0015e769e
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aff946595aedb16a0c7784ed5884c193c6cec4fa
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d04187663e5e91ab625841d2987affdd8dd038cc1ac8842ac0022817f7edebf
+size 671150064
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..61c6483d87895fb78ce096b61dc5269425f053c8
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5816ea9154f9cc287eeaf1955021709fdb1804c697d177601df873dc08573c5
+size 918507402
diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..42a63251765c58d2b0f1161b83917dbffd8c9bfc
--- /dev/null
+++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,39417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 5625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00017777777777777779,
+      "grad_norm": 1.258857068322721,
+      "learning_rate": 1.183431952662722e-06,
+      "loss": 1.5465,
+      "step": 1
+    },
+    {
+      "epoch": 0.00035555555555555557,
+      "grad_norm": 1.1499228169913636,
+      "learning_rate": 2.366863905325444e-06,
+      "loss": 1.58,
+      "step": 2
+    },
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 1.1777685176179222,
+      "learning_rate": 3.550295857988166e-06,
+      "loss": 1.5768,
+      "step": 3
+    },
+    {
+      "epoch": 0.0007111111111111111,
+      "grad_norm": 1.135862332672188,
+      "learning_rate": 4.733727810650888e-06,
+      "loss": 1.5312,
+      "step": 4
+    },
+    {
+      "epoch": 0.0008888888888888889,
+      "grad_norm": 1.1356684044513992,
+      "learning_rate": 5.917159763313609e-06,
+      "loss": 1.6033,
+      "step": 5
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.069577260780715,
+      "learning_rate": 7.100591715976332e-06,
+      "loss": 1.5259,
+      "step": 6
+    },
+    {
+      "epoch": 0.0012444444444444445,
+      "grad_norm": 1.1342025491596588,
+      "learning_rate": 8.284023668639054e-06,
+      "loss": 1.5418,
+      "step": 7
+    },
+    {
+      "epoch": 0.0014222222222222223,
+      "grad_norm": 0.9560259415069341,
+      "learning_rate": 9.467455621301776e-06,
+      "loss": 1.4498,
+      "step": 8
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9288261561394549,
+      "learning_rate": 1.0650887573964498e-05,
+      "loss": 1.4003,
+      "step": 9
+    },
+    {
+      "epoch": 0.0017777777777777779,
+      "grad_norm": 0.9685459518549487,
+      "learning_rate": 1.1834319526627219e-05,
+      "loss": 1.4169,
+      "step": 10
+    },
+    {
+      "epoch": 0.0019555555555555554,
+      "grad_norm": 0.9747858026297386,
+      "learning_rate": 1.3017751479289941e-05,
+      "loss": 1.3656,
+      "step": 11
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.9844283145066611,
+      "learning_rate": 1.4201183431952663e-05,
+      "loss": 1.2732,
+      "step": 12
+    },
+    {
+      "epoch": 0.002311111111111111,
+      "grad_norm": 1.0346465291991604,
+      "learning_rate": 1.5384615384615387e-05,
+      "loss": 1.2578,
+      "step": 13
+    },
+    {
+      "epoch": 0.002488888888888889,
+      "grad_norm": 0.9148693827208039,
+      "learning_rate": 1.6568047337278108e-05,
+      "loss": 1.1807,
+      "step": 14
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.9965068572355378,
+      "learning_rate": 1.7751479289940828e-05,
+      "loss": 1.1374,
+      "step": 15
+    },
+    {
+      "epoch": 0.0028444444444444446,
+      "grad_norm": 1.2870154033513308,
+      "learning_rate": 1.8934911242603552e-05,
+      "loss": 1.1152,
+      "step": 16
+    },
+    {
+      "epoch": 0.003022222222222222,
+      "grad_norm": 0.9480513594305368,
+      "learning_rate": 2.0118343195266273e-05,
+      "loss": 1.1463,
+      "step": 17
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8331421966271232,
+      "learning_rate": 2.1301775147928997e-05,
+      "loss": 1.0336,
+      "step": 18
+    },
+    {
+      "epoch": 0.0033777777777777777,
+      "grad_norm": 0.8228833984417278,
+      "learning_rate": 2.2485207100591717e-05,
+      "loss": 0.9945,
+      "step": 19
+    },
+    {
+      "epoch": 0.0035555555555555557,
+      "grad_norm": 0.8554826793933942,
+      "learning_rate": 2.3668639053254438e-05,
+      "loss": 0.9625,
+      "step": 20
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.836981544556156,
+      "learning_rate": 2.485207100591716e-05,
+      "loss": 0.9352,
+      "step": 21
+    },
+    {
+      "epoch": 0.003911111111111111,
+      "grad_norm": 0.7442301067026529,
+      "learning_rate": 2.6035502958579882e-05,
+      "loss": 0.9722,
+      "step": 22
+    },
+    {
+      "epoch": 0.004088888888888889,
+      "grad_norm": 0.7223817516900877,
+      "learning_rate": 2.7218934911242606e-05,
+      "loss": 0.957,
+      "step": 23
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.6851216084244861,
+      "learning_rate": 2.8402366863905327e-05,
+      "loss": 0.9225,
+      "step": 24
+    },
+    {
+      "epoch": 0.0044444444444444444,
+      "grad_norm": 0.6173778339952876,
+      "learning_rate": 2.958579881656805e-05,
+      "loss": 0.9219,
+      "step": 25
+    },
+    {
+      "epoch": 0.004622222222222222,
+      "grad_norm": 0.6145931846767859,
+      "learning_rate": 3.0769230769230774e-05,
+      "loss": 0.89,
+      "step": 26
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.5595801943750954,
+      "learning_rate": 3.195266272189349e-05,
+      "loss": 0.8878,
+      "step": 27
+    },
+    {
+      "epoch": 0.004977777777777778,
+      "grad_norm": 0.5900905880559347,
+      "learning_rate": 3.3136094674556215e-05,
+      "loss": 0.8746,
+      "step": 28
+    },
+    {
+      "epoch": 0.005155555555555556,
+      "grad_norm": 0.5450578628301268,
+      "learning_rate": 3.431952662721893e-05,
+      "loss": 0.925,
+      "step": 29
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.6261420506557391,
+      "learning_rate": 3.5502958579881656e-05,
+      "loss": 0.927,
+      "step": 30
+    },
+    {
+      "epoch": 0.005511111111111111,
+      "grad_norm": 0.5369435958949663,
+      "learning_rate": 3.668639053254438e-05,
+      "loss": 0.8643,
+      "step": 31
+    },
+    {
+      "epoch": 0.005688888888888889,
+      "grad_norm": 0.55770743392252,
+      "learning_rate": 3.7869822485207104e-05,
+      "loss": 0.8997,
+      "step": 32
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.5585332480788119,
+      "learning_rate": 3.905325443786982e-05,
+      "loss": 0.9353,
+      "step": 33
+    },
+    {
+      "epoch": 0.006044444444444444,
+      "grad_norm": 0.5315619567883817,
+      "learning_rate": 4.0236686390532545e-05,
+      "loss": 0.8873,
+      "step": 34
+    },
+    {
+      "epoch": 0.006222222222222222,
+      "grad_norm": 0.5151717959806205,
+      "learning_rate": 4.142011834319527e-05,
+      "loss": 0.8322,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.5274177891190261,
+      "learning_rate": 4.260355029585799e-05,
+      "loss": 0.8908,
+      "step": 36
+    },
+    {
+      "epoch": 0.006577777777777778,
+      "grad_norm": 0.4889475833535269,
+      "learning_rate": 4.378698224852072e-05,
+      "loss": 0.8288,
+      "step": 37
+    },
+    {
+      "epoch": 0.0067555555555555554,
+      "grad_norm": 0.5042479504468477,
+      "learning_rate": 4.4970414201183434e-05,
+      "loss": 0.8606,
+      "step": 38
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.5168725063826504,
+      "learning_rate": 4.615384615384616e-05,
+      "loss": 0.8608,
+      "step": 39
+    },
+    {
+      "epoch": 0.0071111111111111115,
+      "grad_norm": 0.49628507591003773,
+      "learning_rate": 4.7337278106508875e-05,
+      "loss": 0.8632,
+      "step": 40
+    },
+    {
+      "epoch": 0.007288888888888889,
+      "grad_norm": 0.501678094318914,
+      "learning_rate": 4.85207100591716e-05,
+      "loss": 0.8007,
+      "step": 41
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.49661664456150134,
+      "learning_rate": 4.970414201183432e-05,
+      "loss": 0.8357,
+      "step": 42
+    },
+    {
+      "epoch": 0.007644444444444444,
+      "grad_norm": 1.7448635336781273,
+      "learning_rate": 5.088757396449705e-05,
+      "loss": 0.786,
+      "step": 43
+    },
+    {
+      "epoch": 0.007822222222222222,
+      "grad_norm": 0.5126282855810396,
+      "learning_rate": 5.2071005917159764e-05,
+      "loss": 0.8559,
+      "step": 44
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.4681780693228782,
+      "learning_rate": 5.3254437869822495e-05,
+      "loss": 0.8096,
+      "step": 45
+    },
+    {
+      "epoch": 0.008177777777777779,
+      "grad_norm": 0.4830587733085102,
+      "learning_rate": 5.443786982248521e-05,
+      "loss": 0.7422,
+      "step": 46
+    },
+    {
+      "epoch": 0.008355555555555555,
+      "grad_norm": 0.49739529758924983,
+      "learning_rate": 5.562130177514793e-05,
+      "loss": 0.8457,
+      "step": 47
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.49131119990466243,
+      "learning_rate": 5.680473372781065e-05,
+      "loss": 0.8032,
+      "step": 48
+    },
+    {
+      "epoch": 0.00871111111111111,
+      "grad_norm": 0.5181650685005122,
+      "learning_rate": 5.798816568047337e-05,
+      "loss": 0.8473,
+      "step": 49
+    },
+    {
+      "epoch": 0.008888888888888889,
+      "grad_norm": 0.5587762171778651,
+      "learning_rate": 5.91715976331361e-05,
+      "loss": 0.7942,
+      "step": 50
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.5089348340184763,
+      "learning_rate": 6.035502958579882e-05,
+      "loss": 0.8363,
+      "step": 51
+    },
+    {
+      "epoch": 0.009244444444444444,
+      "grad_norm": 0.507227980739972,
+      "learning_rate": 6.153846153846155e-05,
+      "loss": 0.8142,
+      "step": 52
+    },
+    {
+      "epoch": 0.009422222222222222,
+      "grad_norm": 0.47958342316458724,
+      "learning_rate": 6.272189349112427e-05,
+      "loss": 0.7735,
+      "step": 53
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.4722450857881361,
+      "learning_rate": 6.390532544378698e-05,
+      "loss": 0.7868,
+      "step": 54
+    },
+    {
+      "epoch": 0.009777777777777778,
+      "grad_norm": 0.5322210858958987,
+      "learning_rate": 6.50887573964497e-05,
+      "loss": 0.7873,
+      "step": 55
+    },
+    {
+      "epoch": 0.009955555555555556,
+      "grad_norm": 0.49645778251385503,
+      "learning_rate": 6.627218934911243e-05,
+      "loss": 0.8828,
+      "step": 56
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.5080150556428977,
+      "learning_rate": 6.745562130177515e-05,
+      "loss": 0.8993,
+      "step": 57
+    },
+    {
+      "epoch": 0.010311111111111111,
+      "grad_norm": 0.4899449502743537,
+      "learning_rate": 6.863905325443787e-05,
+      "loss": 0.7542,
+      "step": 58
+    },
+    {
+      "epoch": 0.01048888888888889,
+      "grad_norm": 0.5204418152838295,
+      "learning_rate": 6.98224852071006e-05,
+      "loss": 0.7403,
+      "step": 59
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.4859982853059265,
+      "learning_rate": 7.100591715976331e-05,
+      "loss": 0.7863,
+      "step": 60
+    },
+    {
+      "epoch": 0.010844444444444445,
+      "grad_norm": 0.5183938626677759,
+      "learning_rate": 7.218934911242604e-05,
+      "loss": 0.8175,
+      "step": 61
+    },
+    {
+      "epoch": 0.011022222222222221,
+      "grad_norm": 0.49081102741303423,
+      "learning_rate": 7.337278106508876e-05,
+      "loss": 0.8055,
+      "step": 62
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.4925073336413384,
+      "learning_rate": 7.455621301775149e-05,
+      "loss": 0.8165,
+      "step": 63
+    },
+    {
+      "epoch": 0.011377777777777778,
+      "grad_norm": 0.46534238114233867,
+      "learning_rate": 7.573964497041421e-05,
+      "loss": 0.7796,
+      "step": 64
+    },
+    {
+      "epoch": 0.011555555555555555,
+      "grad_norm": 0.49748346282405465,
+      "learning_rate": 7.692307692307693e-05,
+      "loss": 0.8165,
+      "step": 65
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.4535062510331349,
+      "learning_rate": 7.810650887573964e-05,
+      "loss": 0.7805,
+      "step": 66
+    },
+    {
+      "epoch": 0.011911111111111112,
+      "grad_norm": 0.4406028466683985,
+      "learning_rate": 7.928994082840237e-05,
+      "loss": 0.7858,
+      "step": 67
+    },
+    {
+      "epoch": 0.012088888888888889,
+      "grad_norm": 0.46990100693978115,
+      "learning_rate": 8.047337278106509e-05,
+      "loss": 0.7743,
+      "step": 68
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.4637431264987931,
+      "learning_rate": 8.165680473372781e-05,
+      "loss": 0.8224,
+      "step": 69
+    },
+    {
+      "epoch": 0.012444444444444444,
+      "grad_norm": 0.46791205044756595,
+      "learning_rate": 8.284023668639054e-05,
+      "loss": 0.813,
+      "step": 70
+    },
+    {
+      "epoch": 0.012622222222222222,
+      "grad_norm": 0.4630866830977058,
+      "learning_rate": 8.402366863905326e-05,
+      "loss": 0.7565,
+      "step": 71
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.4754904662743993,
+      "learning_rate": 8.520710059171599e-05,
+      "loss": 0.8047,
+      "step": 72
+    },
+    {
+      "epoch": 0.012977777777777777,
+      "grad_norm": 0.4929498834422392,
+      "learning_rate": 8.63905325443787e-05,
+      "loss": 0.7989,
+      "step": 73
+    },
+    {
+      "epoch": 0.013155555555555556,
+      "grad_norm": 0.46023194279728535,
+      "learning_rate": 8.757396449704143e-05,
+      "loss": 0.822,
+      "step": 74
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.4677139906887722,
+      "learning_rate": 8.875739644970414e-05,
+      "loss": 0.8203,
+      "step": 75
+    },
+    {
+      "epoch": 0.013511111111111111,
+      "grad_norm": 0.46126163647133056,
+      "learning_rate": 8.994082840236687e-05,
+      "loss": 0.744,
+      "step": 76
+    },
+    {
+      "epoch": 0.01368888888888889,
+      "grad_norm": 0.4670292874892004,
+      "learning_rate": 9.112426035502959e-05,
+      "loss": 0.8402,
+      "step": 77
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5125656023075831,
+      "learning_rate": 9.230769230769232e-05,
+      "loss": 0.7985,
+      "step": 78
+    },
+    {
+      "epoch": 0.014044444444444444,
+      "grad_norm": 0.4775962168591392,
+      "learning_rate": 9.349112426035503e-05,
+      "loss": 0.723,
+      "step": 79
+    },
+    {
+      "epoch": 0.014222222222222223,
+      "grad_norm": 0.4901539088108059,
+      "learning_rate": 9.467455621301775e-05,
+      "loss": 0.7958,
+      "step": 80
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.4813658150580067,
+      "learning_rate": 9.585798816568048e-05,
+      "loss": 0.8529,
+      "step": 81
+    },
+    {
+      "epoch": 0.014577777777777778,
+      "grad_norm": 0.46073059515040743,
+      "learning_rate": 9.70414201183432e-05,
+      "loss": 0.7186,
+      "step": 82
+    },
+    {
+      "epoch": 0.014755555555555555,
+      "grad_norm": 0.43371134510796305,
+      "learning_rate": 9.822485207100593e-05,
+      "loss": 0.7143,
+      "step": 83
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.48419715159203247,
+      "learning_rate": 9.940828402366865e-05,
+      "loss": 0.8002,
+      "step": 84
+    },
+    {
+      "epoch": 0.015111111111111112,
+      "grad_norm": 0.4866592780490299,
+      "learning_rate": 0.00010059171597633136,
+      "loss": 0.7745,
+      "step": 85
+    },
+    {
+      "epoch": 0.015288888888888888,
+      "grad_norm": 0.44829557184744384,
+      "learning_rate": 0.0001017751479289941,
+      "loss": 0.7724,
+      "step": 86
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.48930248103919116,
+      "learning_rate": 0.0001029585798816568,
+      "loss": 0.7675,
+      "step": 87
+    },
+    {
+      "epoch": 0.015644444444444443,
+      "grad_norm": 0.44046392908694915,
+      "learning_rate": 0.00010414201183431953,
+      "loss": 0.8224,
+      "step": 88
+    },
+    {
+      "epoch": 0.015822222222222224,
+      "grad_norm": 0.4785048069123892,
+      "learning_rate": 0.00010532544378698226,
+      "loss": 0.7931,
+      "step": 89
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.44280483377299734,
+      "learning_rate": 0.00010650887573964499,
+      "loss": 0.7943,
+      "step": 90
+    },
+    {
+      "epoch": 0.016177777777777777,
+      "grad_norm": 0.4284224951394098,
+      "learning_rate": 0.0001076923076923077,
+      "loss": 0.7845,
+      "step": 91
+    },
+    {
+      "epoch": 0.016355555555555557,
+      "grad_norm": 0.4561417148963684,
+      "learning_rate": 0.00010887573964497042,
+      "loss": 0.872,
+      "step": 92
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.4292856846799266,
+      "learning_rate": 0.00011005917159763315,
+      "loss": 0.7981,
+      "step": 93
+    },
+    {
+      "epoch": 0.01671111111111111,
+      "grad_norm": 0.468177963284816,
+      "learning_rate": 0.00011124260355029586,
+      "loss": 0.7708,
+      "step": 94
+    },
+    {
+      "epoch": 0.016888888888888887,
+      "grad_norm": 0.46163788094220537,
+      "learning_rate": 0.00011242603550295858,
+      "loss": 0.7934,
+      "step": 95
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5379963430741206,
+      "learning_rate": 0.0001136094674556213,
+      "loss": 0.7567,
+      "step": 96
+    },
+    {
+      "epoch": 0.017244444444444444,
+      "grad_norm": 0.4711512215552466,
+      "learning_rate": 0.00011479289940828404,
+      "loss": 0.8325,
+      "step": 97
+    },
+    {
+      "epoch": 0.01742222222222222,
+      "grad_norm": 0.4680378040138463,
+      "learning_rate": 0.00011597633136094674,
+      "loss": 0.836,
+      "step": 98
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.41731652982019085,
+      "learning_rate": 0.00011715976331360947,
+      "loss": 0.7679,
+      "step": 99
+    },
+    {
+      "epoch": 0.017777777777777778,
+      "grad_norm": 0.435358473615698,
+      "learning_rate": 0.0001183431952662722,
+      "loss": 0.7828,
+      "step": 100
+    },
+    {
+      "epoch": 0.017955555555555554,
+      "grad_norm": 0.42495852158320324,
+      "learning_rate": 0.00011952662721893493,
+      "loss": 0.7832,
+      "step": 101
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.44184873982043654,
+      "learning_rate": 0.00012071005917159764,
+      "loss": 0.7487,
+      "step": 102
+    },
+    {
+      "epoch": 0.01831111111111111,
+      "grad_norm": 0.4271562627615471,
+      "learning_rate": 0.00012189349112426037,
+      "loss": 0.7724,
+      "step": 103
+    },
+    {
+      "epoch": 0.018488888888888888,
+      "grad_norm": 0.44780279081120256,
+      "learning_rate": 0.0001230769230769231,
+      "loss": 0.7949,
+      "step": 104
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.45176823188045945,
+      "learning_rate": 0.0001242603550295858,
+      "loss": 0.7299,
+      "step": 105
+    },
+    {
+      "epoch": 0.018844444444444445,
+      "grad_norm": 0.4234941506791437,
+      "learning_rate": 0.00012544378698224853,
+      "loss": 0.7299,
+      "step": 106
+    },
+    {
+      "epoch": 0.01902222222222222,
+      "grad_norm": 0.46699578697864047,
+      "learning_rate": 0.00012662721893491125,
+      "loss": 0.7915,
+      "step": 107
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.47310759599204344,
+      "learning_rate": 0.00012781065088757397,
+      "loss": 0.7832,
+      "step": 108
+    },
+    {
+      "epoch": 0.01937777777777778,
+      "grad_norm": 0.461824405614306,
+      "learning_rate": 0.00012899408284023668,
+      "loss": 0.7689,
+      "step": 109
+    },
+    {
+      "epoch": 0.019555555555555555,
+      "grad_norm": 0.47505870701737424,
+      "learning_rate": 0.0001301775147928994,
+      "loss": 0.8128,
+      "step": 110
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.45568262964730294,
+      "learning_rate": 0.00013136094674556214,
+      "loss": 0.7069,
+      "step": 111
+    },
+    {
+      "epoch": 0.019911111111111112,
+      "grad_norm": 0.44379721917882264,
+      "learning_rate": 0.00013254437869822486,
+      "loss": 0.741,
+      "step": 112
+    },
+    {
+      "epoch": 0.02008888888888889,
+      "grad_norm": 0.45064494937433286,
+      "learning_rate": 0.00013372781065088758,
+      "loss": 0.7739,
+      "step": 113
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.43217563863386493,
+      "learning_rate": 0.0001349112426035503,
+      "loss": 0.7458,
+      "step": 114
+    },
+    {
+      "epoch": 0.020444444444444446,
+      "grad_norm": 0.4198571594139149,
+      "learning_rate": 0.00013609467455621304,
+      "loss": 0.6837,
+      "step": 115
+    },
+    {
+      "epoch": 0.020622222222222222,
+      "grad_norm": 0.5378757543769198,
+      "learning_rate": 0.00013727810650887573,
+      "loss": 0.7717,
+      "step": 116
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.45219385363053144,
+      "learning_rate": 0.00013846153846153847,
+      "loss": 0.7895,
+      "step": 117
+    },
+    {
+      "epoch": 0.02097777777777778,
+      "grad_norm": 0.43258201378316113,
+      "learning_rate": 0.0001396449704142012,
+      "loss": 0.8123,
+      "step": 118
+    },
+    {
+      "epoch": 0.021155555555555556,
+      "grad_norm": 0.44382187341418,
+      "learning_rate": 0.0001408284023668639,
+      "loss": 0.7333,
+      "step": 119
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.45541998194172073,
+      "learning_rate": 0.00014201183431952663,
+      "loss": 0.8149,
+      "step": 120
+    },
+    {
+      "epoch": 0.021511111111111113,
+      "grad_norm": 0.40917327672060544,
+      "learning_rate": 0.00014319526627218934,
+      "loss": 0.7622,
+      "step": 121
+    },
+    {
+      "epoch": 0.02168888888888889,
+      "grad_norm": 0.4129317001299444,
+      "learning_rate": 0.0001443786982248521,
+      "loss": 0.7548,
+      "step": 122
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.46956772253344264,
+      "learning_rate": 0.0001455621301775148,
+      "loss": 0.7787,
+      "step": 123
+    },
+    {
+      "epoch": 0.022044444444444443,
+      "grad_norm": 0.4411855560722539,
+      "learning_rate": 0.00014674556213017752,
+      "loss": 0.708,
+      "step": 124
+    },
+    {
+      "epoch": 0.022222222222222223,
+      "grad_norm": 0.4542310213696035,
+      "learning_rate": 0.00014792899408284024,
+      "loss": 0.746,
+      "step": 125
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4410051734654425,
+      "learning_rate": 0.00014911242603550298,
+      "loss": 0.6443,
+      "step": 126
+    },
+    {
+      "epoch": 0.022577777777777776,
+      "grad_norm": 0.44538375957173787,
+      "learning_rate": 0.00015029585798816567,
+      "loss": 0.7591,
+      "step": 127
+    },
+    {
+      "epoch": 0.022755555555555557,
+      "grad_norm": 0.4768115952145477,
+      "learning_rate": 0.00015147928994082842,
+      "loss": 0.7634,
+      "step": 128
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.4310921521908967,
+      "learning_rate": 0.00015266272189349113,
+      "loss": 0.7354,
+      "step": 129
+    },
+    {
+      "epoch": 0.02311111111111111,
+      "grad_norm": 0.4219097353849593,
+      "learning_rate": 0.00015384615384615385,
+      "loss": 0.7576,
+      "step": 130
+    },
+    {
+      "epoch": 0.02328888888888889,
+      "grad_norm": 0.4493492763866972,
+      "learning_rate": 0.00015502958579881657,
+      "loss": 0.7398,
+      "step": 131
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.44449137191974447,
+      "learning_rate": 0.00015621301775147929,
+      "loss": 0.7274,
+      "step": 132
+    },
+    {
+      "epoch": 0.023644444444444444,
+      "grad_norm": 0.4437401335874507,
+      "learning_rate": 0.00015739644970414203,
+      "loss": 0.7174,
+      "step": 133
+    },
+    {
+      "epoch": 0.023822222222222224,
+      "grad_norm": 0.4601812889711194,
+      "learning_rate": 0.00015857988165680475,
+      "loss": 0.7531,
+      "step": 134
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4128429759868119,
+      "learning_rate": 0.00015976331360946746,
+      "loss": 0.7219,
+      "step": 135
+    },
+    {
+      "epoch": 0.024177777777777777,
+      "grad_norm": 0.4376573688898078,
+      "learning_rate": 0.00016094674556213018,
+      "loss": 0.7796,
+      "step": 136
+    },
+    {
+      "epoch": 0.024355555555555554,
+      "grad_norm": 0.42645604923791425,
+      "learning_rate": 0.00016213017751479293,
+      "loss": 0.725,
+      "step": 137
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.44839644907574444,
+      "learning_rate": 0.00016331360946745562,
+      "loss": 0.7674,
+      "step": 138
+    },
+    {
+      "epoch": 0.02471111111111111,
+      "grad_norm": 0.44763816117785193,
+      "learning_rate": 0.00016449704142011836,
+      "loss": 0.7916,
+      "step": 139
+    },
+    {
+      "epoch": 0.024888888888888887,
+      "grad_norm": 0.4762683216146698,
+      "learning_rate": 0.00016568047337278108,
+      "loss": 0.7129,
+      "step": 140
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.4974991361000172,
+      "learning_rate": 0.0001668639053254438,
+      "loss": 0.7364,
+      "step": 141
+    },
+    {
+      "epoch": 0.025244444444444444,
+      "grad_norm": 0.4176301140459096,
+      "learning_rate": 0.0001680473372781065,
+      "loss": 0.7523,
+      "step": 142
+    },
+    {
+      "epoch": 0.02542222222222222,
+      "grad_norm": 0.4468545379435327,
+      "learning_rate": 0.00016923076923076923,
+      "loss": 0.7662,
+      "step": 143
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.43655264981596725,
+      "learning_rate": 0.00017041420118343197,
+      "loss": 0.7323,
+      "step": 144
+    },
+    {
+      "epoch": 0.025777777777777778,
+      "grad_norm": 0.45957397871169015,
+      "learning_rate": 0.0001715976331360947,
+      "loss": 0.7618,
+      "step": 145
+    },
+    {
+      "epoch": 0.025955555555555555,
+      "grad_norm": 0.4282793392026257,
+      "learning_rate": 0.0001727810650887574,
+      "loss": 0.8245,
+      "step": 146
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.4365734245351505,
+      "learning_rate": 0.00017396449704142012,
+      "loss": 0.7666,
+      "step": 147
+    },
+    {
+      "epoch": 0.02631111111111111,
+      "grad_norm": 0.4110531659484318,
+      "learning_rate": 0.00017514792899408287,
+      "loss": 0.7695,
+      "step": 148
+    },
+    {
+      "epoch": 0.026488888888888888,
+      "grad_norm": 0.4548179603502384,
+      "learning_rate": 0.00017633136094674556,
+      "loss": 0.7664,
+      "step": 149
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.4684089371665446,
+      "learning_rate": 0.00017751479289940828,
+      "loss": 0.7695,
+      "step": 150
+    },
+    {
+      "epoch": 0.026844444444444445,
+      "grad_norm": 0.4667143348446463,
+      "learning_rate": 0.00017869822485207102,
+      "loss": 0.7943,
+      "step": 151
+    },
+    {
+      "epoch": 0.027022222222222222,
+      "grad_norm": 1.7980695513854181,
+      "learning_rate": 0.00017988165680473374,
+      "loss": 0.7294,
+      "step": 152
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.42960716648493547,
+      "learning_rate": 0.00018106508875739645,
+      "loss": 0.7794,
+      "step": 153
+    },
+    {
+      "epoch": 0.02737777777777778,
+      "grad_norm": 0.4449433292223409,
+      "learning_rate": 0.00018224852071005917,
+      "loss": 0.7892,
+      "step": 154
+    },
+    {
+      "epoch": 0.027555555555555555,
+      "grad_norm": 0.4372015682071353,
+      "learning_rate": 0.00018343195266272192,
+      "loss": 0.7472,
+      "step": 155
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.43865271321098903,
+      "learning_rate": 0.00018461538461538463,
+      "loss": 0.7408,
+      "step": 156
+    },
+    {
+      "epoch": 0.027911111111111112,
+      "grad_norm": 0.45771557496816895,
+      "learning_rate": 0.00018579881656804735,
+      "loss": 0.7857,
+      "step": 157
+    },
+    {
+      "epoch": 0.02808888888888889,
+      "grad_norm": 0.4319156816172843,
+      "learning_rate": 0.00018698224852071007,
+      "loss": 0.7963,
+      "step": 158
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4525884936465943,
+      "learning_rate": 0.00018816568047337278,
+      "loss": 0.7295,
+      "step": 159
+    },
+    {
+      "epoch": 0.028444444444444446,
+      "grad_norm": 0.39884357895024963,
+      "learning_rate": 0.0001893491124260355,
+      "loss": 0.7267,
+      "step": 160
+    },
+    {
+      "epoch": 0.028622222222222223,
+      "grad_norm": 0.42876394034800636,
+      "learning_rate": 0.00019053254437869822,
+      "loss": 0.7522,
+      "step": 161
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4462597385837125,
+      "learning_rate": 0.00019171597633136096,
+      "loss": 0.7429,
+      "step": 162
+    },
+    {
+      "epoch": 0.02897777777777778,
+      "grad_norm": 0.48819492924245217,
+      "learning_rate": 0.00019289940828402368,
+      "loss": 0.8142,
+      "step": 163
+    },
+    {
+      "epoch": 0.029155555555555556,
+      "grad_norm": 0.47465656582797483,
+      "learning_rate": 0.0001940828402366864,
+      "loss": 0.8171,
+      "step": 164
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.42346675558380087,
+      "learning_rate": 0.00019526627218934911,
+      "loss": 0.7243,
+      "step": 165
+    },
+    {
+      "epoch": 0.02951111111111111,
+      "grad_norm": 0.396978468291565,
+      "learning_rate": 0.00019644970414201186,
+      "loss": 0.726,
+      "step": 166
+    },
+    {
+      "epoch": 0.02968888888888889,
+      "grad_norm": 0.4340506416543326,
+      "learning_rate": 0.00019763313609467458,
+      "loss": 0.7948,
+      "step": 167
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.44736208770371383,
+      "learning_rate": 0.0001988165680473373,
+      "loss": 0.7493,
+      "step": 168
+    },
+    {
+      "epoch": 0.030044444444444443,
+      "grad_norm": 0.44842977629625974,
+      "learning_rate": 0.0002,
+      "loss": 0.7384,
+      "step": 169
+    },
+    {
+      "epoch": 0.030222222222222223,
+      "grad_norm": 0.4377904907138219,
+      "learning_rate": 0.00019999998342242452,
+      "loss": 0.7333,
+      "step": 170
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4471493498256661,
+      "learning_rate": 0.0001999999336897035,
+      "loss": 0.7827,
+      "step": 171
+    },
+    {
+      "epoch": 0.030577777777777777,
+      "grad_norm": 0.44748572645868845,
+      "learning_rate": 0.0001999998508018535,
+      "loss": 0.7772,
+      "step": 172
+    },
+    {
+      "epoch": 0.030755555555555557,
+      "grad_norm": 0.4096593349436675,
+      "learning_rate": 0.000199999734758902,
+      "loss": 0.7464,
+      "step": 173
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.41697363668435844,
+      "learning_rate": 0.0001999995855608874,
+      "loss": 0.7035,
+      "step": 174
+    },
+    {
+      "epoch": 0.03111111111111111,
+      "grad_norm": 0.4256245750203348,
+      "learning_rate": 0.00019999940320785924,
+      "loss": 0.7817,
+      "step": 175
+    },
+    {
+      "epoch": 0.03128888888888889,
+      "grad_norm": 0.4249773552544686,
+      "learning_rate": 0.00019999918769987796,
+      "loss": 0.7573,
+      "step": 176
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.4310933153219558,
+      "learning_rate": 0.00019999893903701498,
+      "loss": 0.7791,
+      "step": 177
+    },
+    {
+      "epoch": 0.03164444444444445,
+      "grad_norm": 0.4227463269176802,
+      "learning_rate": 0.0001999986572193528,
+      "loss": 0.7597,
+      "step": 178
+    },
+    {
+      "epoch": 0.031822222222222224,
+      "grad_norm": 0.40501337189453185,
+      "learning_rate": 0.0001999983422469848,
+      "loss": 0.7852,
+      "step": 179
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.41866672247549885,
+      "learning_rate": 0.00019999799412001546,
+      "loss": 0.7271,
+      "step": 180
+    },
+    {
+      "epoch": 0.03217777777777778,
+      "grad_norm": 0.4180055203499516,
+      "learning_rate": 0.00019999761283856016,
+      "loss": 0.6782,
+      "step": 181
+    },
+    {
+      "epoch": 0.032355555555555554,
+      "grad_norm": 0.42387302787166425,
+      "learning_rate": 0.00019999719840274534,
+      "loss": 0.7202,
+      "step": 182
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.415592966763154,
+      "learning_rate": 0.0001999967508127084,
+      "loss": 0.7348,
+      "step": 183
+    },
+    {
+      "epoch": 0.032711111111111114,
+      "grad_norm": 0.4231487626538851,
+      "learning_rate": 0.00019999627006859775,
+      "loss": 0.7418,
+      "step": 184
+    },
+    {
+      "epoch": 0.03288888888888889,
+      "grad_norm": 0.43838224153476424,
+      "learning_rate": 0.00019999575617057276,
+      "loss": 0.7352,
+      "step": 185
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.436577132526167,
+      "learning_rate": 0.00019999520911880383,
+      "loss": 0.7903,
+      "step": 186
+    },
+    {
+      "epoch": 0.033244444444444445,
+      "grad_norm": 0.4396741594040606,
+      "learning_rate": 0.00019999462891347235,
+      "loss": 0.7124,
+      "step": 187
+    },
+    {
+      "epoch": 0.03342222222222222,
+      "grad_norm": 0.42165424177243394,
+      "learning_rate": 0.00019999401555477063,
+      "loss": 0.7505,
+      "step": 188
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4216086489577599,
+      "learning_rate": 0.00019999336904290207,
+      "loss": 0.7577,
+      "step": 189
+    },
+    {
+      "epoch": 0.033777777777777775,
+      "grad_norm": 0.4189770102010155,
+      "learning_rate": 0.00019999268937808103,
+      "loss": 0.6997,
+      "step": 190
+    },
+    {
+      "epoch": 0.03395555555555556,
+      "grad_norm": 0.4340902191803675,
+      "learning_rate": 0.00019999197656053288,
+      "loss": 0.7419,
+      "step": 191
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.44127383729299113,
+      "learning_rate": 0.0001999912305904939,
+      "loss": 0.7438,
+      "step": 192
+    },
+    {
+      "epoch": 0.03431111111111111,
+      "grad_norm": 0.4175352351319076,
+      "learning_rate": 0.0001999904514682114,
+      "loss": 0.7364,
+      "step": 193
+    },
+    {
+      "epoch": 0.03448888888888889,
+      "grad_norm": 0.4188165320146356,
+      "learning_rate": 0.00019998963919394376,
+      "loss": 0.7177,
+      "step": 194
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.4387485892400164,
+      "learning_rate": 0.00019998879376796028,
+      "loss": 0.7802,
+      "step": 195
+    },
+    {
+      "epoch": 0.03484444444444444,
+      "grad_norm": 0.4102698316318348,
+      "learning_rate": 0.00019998791519054127,
+      "loss": 0.7439,
+      "step": 196
+    },
+    {
+      "epoch": 0.035022222222222225,
+      "grad_norm": 0.4477879080743453,
+      "learning_rate": 0.00019998700346197796,
+      "loss": 0.7799,
+      "step": 197
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.43547432937232106,
+      "learning_rate": 0.0001999860585825727,
+      "loss": 0.8031,
+      "step": 198
+    },
+    {
+      "epoch": 0.03537777777777778,
+      "grad_norm": 0.40701515352068407,
+      "learning_rate": 0.00019998508055263874,
+      "loss": 0.7322,
+      "step": 199
+    },
+    {
+      "epoch": 0.035555555555555556,
+      "grad_norm": 0.4055655633697622,
+      "learning_rate": 0.00019998406937250034,
+      "loss": 0.7683,
+      "step": 200
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.3941214398957608,
+      "learning_rate": 0.00019998302504249278,
+      "loss": 0.7272,
+      "step": 201
+    },
+    {
+      "epoch": 0.03591111111111111,
+      "grad_norm": 0.4208078679013026,
+      "learning_rate": 0.0001999819475629623,
+      "loss": 0.7143,
+      "step": 202
+    },
+    {
+      "epoch": 0.036088888888888886,
+      "grad_norm": 0.4339666746633097,
+      "learning_rate": 0.00019998083693426616,
+      "loss": 0.752,
+      "step": 203
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.40483388640974854,
+      "learning_rate": 0.00019997969315677252,
+      "loss": 0.6768,
+      "step": 204
+    },
+    {
+      "epoch": 0.036444444444444446,
+      "grad_norm": 0.41543278835837655,
+      "learning_rate": 0.00019997851623086067,
+      "loss": 0.7508,
+      "step": 205
+    },
+    {
+      "epoch": 0.03662222222222222,
+      "grad_norm": 0.4095256756764992,
+      "learning_rate": 0.00019997730615692083,
+      "loss": 0.7039,
+      "step": 206
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4124476756263938,
+      "learning_rate": 0.00019997606293535415,
+      "loss": 0.7822,
+      "step": 207
+    },
+    {
+      "epoch": 0.036977777777777776,
+      "grad_norm": 0.42234578290270847,
+      "learning_rate": 0.00019997478656657287,
+      "loss": 0.7744,
+      "step": 208
+    },
+    {
+      "epoch": 0.03715555555555555,
+      "grad_norm": 0.42601138572474845,
+      "learning_rate": 0.00019997347705100015,
+      "loss": 0.77,
+      "step": 209
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.41680075142207174,
+      "learning_rate": 0.00019997213438907013,
+      "loss": 0.7534,
+      "step": 210
+    },
+    {
+      "epoch": 0.03751111111111111,
+      "grad_norm": 0.42595370919896436,
+      "learning_rate": 0.000199970758581228,
+      "loss": 0.7783,
+      "step": 211
+    },
+    {
+      "epoch": 0.03768888888888889,
+      "grad_norm": 0.41078007123190574,
+      "learning_rate": 0.00019996934962792994,
+      "loss": 0.7515,
+      "step": 212
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.4009652243031692,
+      "learning_rate": 0.00019996790752964305,
+      "loss": 0.7023,
+      "step": 213
+    },
+    {
+      "epoch": 0.03804444444444444,
+      "grad_norm": 0.4293457117206223,
+      "learning_rate": 0.0001999664322868455,
+      "loss": 0.7782,
+      "step": 214
+    },
+    {
+      "epoch": 0.03822222222222222,
+      "grad_norm": 0.4132962337752529,
+      "learning_rate": 0.00019996492390002635,
+      "loss": 0.7793,
+      "step": 215
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.3826851911935954,
+      "learning_rate": 0.00019996338236968574,
+      "loss": 0.7082,
+      "step": 216
+    },
+    {
+      "epoch": 0.03857777777777778,
+      "grad_norm": 0.3920900648286519,
+      "learning_rate": 0.0001999618076963348,
+      "loss": 0.7182,
+      "step": 217
+    },
+    {
+      "epoch": 0.03875555555555556,
+      "grad_norm": 0.42976393460604073,
+      "learning_rate": 0.00019996019988049554,
+      "loss": 0.7998,
+      "step": 218
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.38007212973087107,
+      "learning_rate": 0.0001999585589227011,
+      "loss": 0.689,
+      "step": 219
+    },
+    {
+      "epoch": 0.03911111111111111,
+      "grad_norm": 0.41540261813365553,
+      "learning_rate": 0.00019995688482349553,
+      "loss": 0.7089,
+      "step": 220
+    },
+    {
+      "epoch": 0.03928888888888889,
+      "grad_norm": 0.39321320573098695,
+      "learning_rate": 0.00019995517758343386,
+      "loss": 0.694,
+      "step": 221
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.4037515322280404,
+      "learning_rate": 0.00019995343720308212,
+      "loss": 0.7356,
+      "step": 222
+    },
+    {
+      "epoch": 0.03964444444444445,
+      "grad_norm": 0.42011262289834633,
+      "learning_rate": 0.00019995166368301734,
+      "loss": 0.717,
+      "step": 223
+    },
+    {
+      "epoch": 0.039822222222222224,
+      "grad_norm": 0.42873694748103836,
+      "learning_rate": 0.00019994985702382758,
+      "loss": 0.7633,
+      "step": 224
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.3961804083832185,
+      "learning_rate": 0.00019994801722611182,
+      "loss": 0.6979,
+      "step": 225
+    },
+    {
+      "epoch": 0.04017777777777778,
+      "grad_norm": 0.4172279974998365,
+      "learning_rate": 0.00019994614429047998,
+      "loss": 0.706,
+      "step": 226
+    },
+    {
+      "epoch": 0.040355555555555554,
+      "grad_norm": 0.42688994383848117,
+      "learning_rate": 0.00019994423821755313,
+      "loss": 0.7777,
+      "step": 227
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.4250817717428973,
+      "learning_rate": 0.00019994229900796318,
+      "loss": 0.7273,
+      "step": 228
+    },
+    {
+      "epoch": 0.040711111111111115,
+      "grad_norm": 0.4136562214162894,
+      "learning_rate": 0.00019994032666235308,
+      "loss": 0.7799,
+      "step": 229
+    },
+    {
+      "epoch": 0.04088888888888889,
+      "grad_norm": 0.4214659029090264,
+      "learning_rate": 0.00019993832118137678,
+      "loss": 0.7233,
+      "step": 230
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.40643440979476614,
+      "learning_rate": 0.0001999362825656992,
+      "loss": 0.7083,
+      "step": 231
+    },
+    {
+      "epoch": 0.041244444444444445,
+      "grad_norm": 0.41358356997274226,
+      "learning_rate": 0.0001999342108159962,
+      "loss": 0.7279,
+      "step": 232
+    },
+    {
+      "epoch": 0.04142222222222222,
+      "grad_norm": 0.4319374532419072,
+      "learning_rate": 0.00019993210593295473,
+      "loss": 0.7574,
+      "step": 233
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4307115281367592,
+      "learning_rate": 0.00019992996791727267,
+      "loss": 0.7422,
+      "step": 234
+    },
+    {
+      "epoch": 0.041777777777777775,
+      "grad_norm": 0.4042382399897502,
+      "learning_rate": 0.00019992779676965885,
+      "loss": 0.7391,
+      "step": 235
+    },
+    {
+      "epoch": 0.04195555555555556,
+      "grad_norm": 0.4588002095047031,
+      "learning_rate": 0.0001999255924908331,
+      "loss": 0.7262,
+      "step": 236
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.42467507977251784,
+      "learning_rate": 0.00019992335508152632,
+      "loss": 0.7982,
+      "step": 237
+    },
+    {
+      "epoch": 0.04231111111111111,
+      "grad_norm": 0.4105715522868043,
+      "learning_rate": 0.00019992108454248023,
+      "loss": 0.7324,
+      "step": 238
+    },
+    {
+      "epoch": 0.04248888888888889,
+      "grad_norm": 0.43549057774407846,
+      "learning_rate": 0.00019991878087444772,
+      "loss": 0.7302,
+      "step": 239
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.3885617399762617,
+      "learning_rate": 0.00019991644407819256,
+      "loss": 0.7575,
+      "step": 240
+    },
+    {
+      "epoch": 0.04284444444444444,
+      "grad_norm": 0.408647891930264,
+      "learning_rate": 0.00019991407415448947,
+      "loss": 0.7108,
+      "step": 241
+    },
+    {
+      "epoch": 0.043022222222222226,
+      "grad_norm": 0.40004449768082634,
+      "learning_rate": 0.00019991167110412422,
+      "loss": 0.7199,
+      "step": 242
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4222611035054969,
+      "learning_rate": 0.00019990923492789359,
+      "loss": 0.7717,
+      "step": 243
+    },
+    {
+      "epoch": 0.04337777777777778,
+      "grad_norm": 0.41420806798533705,
+      "learning_rate": 0.00019990676562660524,
+      "loss": 0.778,
+      "step": 244
+    },
+    {
+      "epoch": 0.043555555555555556,
+      "grad_norm": 0.39284232850398737,
+      "learning_rate": 0.00019990426320107792,
+      "loss": 0.7049,
+      "step": 245
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.4041020419763259,
+      "learning_rate": 0.00019990172765214128,
+      "loss": 0.7466,
+      "step": 246
+    },
+    {
+      "epoch": 0.04391111111111111,
+      "grad_norm": 0.4260355274023658,
+      "learning_rate": 0.00019989915898063597,
+      "loss": 0.738,
+      "step": 247
+    },
+    {
+      "epoch": 0.044088888888888886,
+      "grad_norm": 0.4374038023499603,
+      "learning_rate": 0.00019989655718741366,
+      "loss": 0.6973,
+      "step": 248
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.43049231829727097,
+      "learning_rate": 0.000199893922273337,
+      "loss": 0.7792,
+      "step": 249
+    },
+    {
+      "epoch": 0.044444444444444446,
+      "grad_norm": 0.4176026714765864,
+      "learning_rate": 0.00019989125423927956,
+      "loss": 0.6943,
+      "step": 250
+    },
+    {
+      "epoch": 0.04462222222222222,
+      "grad_norm": 0.4059797676151001,
+      "learning_rate": 0.00019988855308612595,
+      "loss": 0.7218,
+      "step": 251
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4158187293131045,
+      "learning_rate": 0.00019988581881477172,
+      "loss": 0.7276,
+      "step": 252
+    },
+    {
+      "epoch": 0.044977777777777776,
+      "grad_norm": 0.40237034388512766,
+      "learning_rate": 0.0001998830514261235,
+      "loss": 0.7587,
+      "step": 253
+    },
+    {
+      "epoch": 0.04515555555555555,
+      "grad_norm": 0.41066420743110094,
+      "learning_rate": 0.0001998802509210987,
+      "loss": 0.7792,
+      "step": 254
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.4076981317466958,
+      "learning_rate": 0.00019987741730062594,
+      "loss": 0.786,
+      "step": 255
+    },
+    {
+      "epoch": 0.04551111111111111,
+      "grad_norm": 0.4103971911062201,
+      "learning_rate": 0.00019987455056564462,
+      "loss": 0.7598,
+      "step": 256
+    },
+    {
+      "epoch": 0.04568888888888889,
+      "grad_norm": 0.41647228585767193,
+      "learning_rate": 0.00019987165071710527,
+      "loss": 0.7664,
+      "step": 257
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.4101745908088802,
+      "learning_rate": 0.00019986871775596937,
+      "loss": 0.8,
+      "step": 258
+    },
+    {
+      "epoch": 0.04604444444444444,
+      "grad_norm": 0.42983877689293504,
+      "learning_rate": 0.00019986575168320925,
+      "loss": 0.6814,
+      "step": 259
+    },
+    {
+      "epoch": 0.04622222222222222,
+      "grad_norm": 0.4057948279672016,
+      "learning_rate": 0.0001998627524998084,
+      "loss": 0.708,
+      "step": 260
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.3991751185185169,
+      "learning_rate": 0.00019985972020676116,
+      "loss": 0.7156,
+      "step": 261
+    },
+    {
+      "epoch": 0.04657777777777778,
+      "grad_norm": 0.41581351090637353,
+      "learning_rate": 0.0001998566548050729,
+      "loss": 0.6909,
+      "step": 262
+    },
+    {
+      "epoch": 0.04675555555555556,
+      "grad_norm": 0.4080322012204585,
+      "learning_rate": 0.00019985355629575997,
+      "loss": 0.7608,
+      "step": 263
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.42162201325708426,
+      "learning_rate": 0.00019985042467984967,
+      "loss": 0.7319,
+      "step": 264
+    },
+    {
+      "epoch": 0.04711111111111111,
+      "grad_norm": 0.39020769390029697,
+      "learning_rate": 0.00019984725995838033,
+      "loss": 0.7121,
+      "step": 265
+    },
+    {
+      "epoch": 0.04728888888888889,
+      "grad_norm": 0.38763501861251853,
+      "learning_rate": 0.00019984406213240113,
+      "loss": 0.74,
+      "step": 266
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.40881310999170967,
+      "learning_rate": 0.0001998408312029724,
+      "loss": 0.7483,
+      "step": 267
+    },
+    {
+      "epoch": 0.04764444444444445,
+      "grad_norm": 0.4768937828755657,
+      "learning_rate": 0.00019983756717116536,
+      "loss": 0.7128,
+      "step": 268
+    },
+    {
+      "epoch": 0.047822222222222224,
+      "grad_norm": 0.3789670433847438,
+      "learning_rate": 0.00019983427003806214,
+      "loss": 0.732,
+      "step": 269
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.40260640659117697,
+      "learning_rate": 0.00019983093980475598,
+      "loss": 0.6963,
+      "step": 270
+    },
+    {
+      "epoch": 0.04817777777777778,
+      "grad_norm": 0.3860090317689809,
+      "learning_rate": 0.00019982757647235094,
+      "loss": 0.7063,
+      "step": 271
+    },
+    {
+      "epoch": 0.048355555555555554,
+      "grad_norm": 0.4326493194592787,
+      "learning_rate": 0.00019982418004196224,
+      "loss": 0.7303,
+      "step": 272
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.40472326762665517,
+      "learning_rate": 0.00019982075051471588,
+      "loss": 0.747,
+      "step": 273
+    },
+    {
+      "epoch": 0.04871111111111111,
+      "grad_norm": 0.41569159761813573,
+      "learning_rate": 0.000199817287891749,
+      "loss": 0.7279,
+      "step": 274
+    },
+    {
+      "epoch": 0.04888888888888889,
+      "grad_norm": 0.3965281745477917,
+      "learning_rate": 0.00019981379217420958,
+      "loss": 0.7107,
+      "step": 275
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.403196072301094,
+      "learning_rate": 0.00019981026336325663,
+      "loss": 0.7621,
+      "step": 276
+    },
+    {
+      "epoch": 0.049244444444444445,
+      "grad_norm": 0.41927919266095026,
+      "learning_rate": 0.0001998067014600602,
+      "loss": 0.7661,
+      "step": 277
+    },
+    {
+      "epoch": 0.04942222222222222,
+      "grad_norm": 0.37584221032027115,
+      "learning_rate": 0.00019980310646580115,
+      "loss": 0.6696,
+      "step": 278
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.42389289080408377,
+      "learning_rate": 0.0001997994783816715,
+      "loss": 0.7832,
+      "step": 279
+    },
+    {
+      "epoch": 0.049777777777777775,
+      "grad_norm": 0.38512553433136487,
+      "learning_rate": 0.0001997958172088741,
+      "loss": 0.7167,
+      "step": 280
+    },
+    {
+      "epoch": 0.04995555555555556,
+      "grad_norm": 0.3859382813533775,
+      "learning_rate": 0.0001997921229486228,
+      "loss": 0.7171,
+      "step": 281
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.4158323653798766,
+      "learning_rate": 0.00019978839560214247,
+      "loss": 0.7713,
+      "step": 282
+    },
+    {
+      "epoch": 0.05031111111111111,
+      "grad_norm": 0.38860124681480823,
+      "learning_rate": 0.00019978463517066888,
+      "loss": 0.727,
+      "step": 283
+    },
+    {
+      "epoch": 0.05048888888888889,
+      "grad_norm": 0.40108690695421373,
+      "learning_rate": 0.00019978084165544883,
+      "loss": 0.7273,
+      "step": 284
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.415268620449402,
+      "learning_rate": 0.0001997770150577401,
+      "loss": 0.7439,
+      "step": 285
+    },
+    {
+      "epoch": 0.05084444444444444,
+      "grad_norm": 0.4282600774927427,
+      "learning_rate": 0.00019977315537881137,
+      "loss": 0.8217,
+      "step": 286
+    },
+    {
+      "epoch": 0.05102222222222222,
+      "grad_norm": 0.3961562991634904,
+      "learning_rate": 0.0001997692626199423,
+      "loss": 0.7108,
+      "step": 287
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4064635607969968,
+      "learning_rate": 0.00019976533678242359,
+      "loss": 0.702,
+      "step": 288
+    },
+    {
+      "epoch": 0.05137777777777778,
+      "grad_norm": 0.42569958908513883,
+      "learning_rate": 0.0001997613778675568,
+      "loss": 0.7546,
+      "step": 289
+    },
+    {
+      "epoch": 0.051555555555555556,
+      "grad_norm": 0.4269333671719804,
+      "learning_rate": 0.00019975738587665456,
+      "loss": 0.7386,
+      "step": 290
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.41759105568698235,
+      "learning_rate": 0.00019975336081104038,
+      "loss": 0.7442,
+      "step": 291
+    },
+    {
+      "epoch": 0.05191111111111111,
+      "grad_norm": 0.45511212746981194,
+      "learning_rate": 0.00019974930267204884,
+      "loss": 0.7446,
+      "step": 292
+    },
+    {
+      "epoch": 0.052088888888888886,
+      "grad_norm": 0.3927148245282799,
+      "learning_rate": 0.00019974521146102537,
+      "loss": 0.7311,
+      "step": 293
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.4345782139011403,
+      "learning_rate": 0.00019974108717932642,
+      "loss": 0.8004,
+      "step": 294
+    },
+    {
+      "epoch": 0.052444444444444446,
+      "grad_norm": 0.3956155061076544,
+      "learning_rate": 0.00019973692982831943,
+      "loss": 0.7173,
+      "step": 295
+    },
+    {
+      "epoch": 0.05262222222222222,
+      "grad_norm": 0.3860379553602568,
+      "learning_rate": 0.00019973273940938275,
+      "loss": 0.7478,
+      "step": 296
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.40023460444830405,
+      "learning_rate": 0.00019972851592390574,
+      "loss": 0.7539,
+      "step": 297
+    },
+    {
+      "epoch": 0.052977777777777776,
+      "grad_norm": 0.3857506700815517,
+      "learning_rate": 0.0001997242593732887,
+      "loss": 0.7852,
+      "step": 298
+    },
+    {
+      "epoch": 0.05315555555555555,
+      "grad_norm": 0.40928687740427483,
+      "learning_rate": 0.00019971996975894286,
+      "loss": 0.8074,
+      "step": 299
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.39557341515474426,
+      "learning_rate": 0.00019971564708229047,
+      "loss": 0.7319,
+      "step": 300
+    },
+    {
+      "epoch": 0.05351111111111111,
+      "grad_norm": 0.3987251405952196,
+      "learning_rate": 0.00019971129134476473,
+      "loss": 0.7554,
+      "step": 301
+    },
+    {
+      "epoch": 0.05368888888888889,
+      "grad_norm": 0.3864495178514433,
+      "learning_rate": 0.0001997069025478098,
+      "loss": 0.6832,
+      "step": 302
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.427056921901087,
+      "learning_rate": 0.0001997024806928808,
+      "loss": 0.7821,
+      "step": 303
+    },
+    {
+      "epoch": 0.054044444444444444,
+      "grad_norm": 0.37317816435074624,
+      "learning_rate": 0.00019969802578144376,
+      "loss": 0.6802,
+      "step": 304
+    },
+    {
+      "epoch": 0.05422222222222222,
+      "grad_norm": 0.4078039396305031,
+      "learning_rate": 0.00019969353781497574,
+      "loss": 0.7069,
+      "step": 305
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3943567927022663,
+      "learning_rate": 0.00019968901679496472,
+      "loss": 0.6958,
+      "step": 306
+    },
+    {
+      "epoch": 0.05457777777777778,
+      "grad_norm": 0.42094678424076726,
+      "learning_rate": 0.00019968446272290968,
+      "loss": 0.7056,
+      "step": 307
+    },
+    {
+      "epoch": 0.05475555555555556,
+      "grad_norm": 0.5245417072250762,
+      "learning_rate": 0.0001996798756003205,
+      "loss": 0.7259,
+      "step": 308
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.41411973399556323,
+      "learning_rate": 0.00019967525542871804,
+      "loss": 0.755,
+      "step": 309
+    },
+    {
+      "epoch": 0.05511111111111111,
+      "grad_norm": 0.3972680177444909,
+      "learning_rate": 0.00019967060220963415,
+      "loss": 0.7137,
+      "step": 310
+    },
+    {
+      "epoch": 0.05528888888888889,
+      "grad_norm": 0.41521443791808565,
+      "learning_rate": 0.00019966591594461157,
+      "loss": 0.7777,
+      "step": 311
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.3916336077331353,
+      "learning_rate": 0.00019966119663520412,
+      "loss": 0.6849,
+      "step": 312
+    },
+    {
+      "epoch": 0.05564444444444445,
+      "grad_norm": 0.42052246462401904,
+      "learning_rate": 0.00019965644428297642,
+      "loss": 0.7569,
+      "step": 313
+    },
+    {
+      "epoch": 0.055822222222222224,
+      "grad_norm": 0.4110369457873654,
+      "learning_rate": 0.00019965165888950414,
+      "loss": 0.6966,
+      "step": 314
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4041355832408566,
+      "learning_rate": 0.00019964684045637387,
+      "loss": 0.7634,
+      "step": 315
+    },
+    {
+      "epoch": 0.05617777777777778,
+      "grad_norm": 0.40942487697009206,
+      "learning_rate": 0.00019964198898518324,
+      "loss": 0.6984,
+      "step": 316
+    },
+    {
+      "epoch": 0.056355555555555555,
+      "grad_norm": 0.4027425524934776,
+      "learning_rate": 0.00019963710447754065,
+      "loss": 0.694,
+      "step": 317
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.40719980235720055,
+      "learning_rate": 0.00019963218693506564,
+      "loss": 0.6956,
+      "step": 318
+    },
+    {
+      "epoch": 0.05671111111111111,
+      "grad_norm": 0.398804474465064,
+      "learning_rate": 0.00019962723635938865,
+      "loss": 0.7416,
+      "step": 319
+    },
+    {
+      "epoch": 0.05688888888888889,
+      "grad_norm": 0.41684825603507647,
+      "learning_rate": 0.000199622252752151,
+      "loss": 0.7722,
+      "step": 320
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.39520916999673306,
+      "learning_rate": 0.000199617236115005,
+      "loss": 0.7179,
+      "step": 321
+    },
+    {
+      "epoch": 0.057244444444444445,
+      "grad_norm": 0.3985887905211269,
+      "learning_rate": 0.00019961218644961397,
+      "loss": 0.6911,
+      "step": 322
+    },
+    {
+      "epoch": 0.05742222222222222,
+      "grad_norm": 0.41078307916742446,
+      "learning_rate": 0.0001996071037576521,
+      "loss": 0.7297,
+      "step": 323
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4003687552320492,
+      "learning_rate": 0.0001996019880408046,
+      "loss": 0.6995,
+      "step": 324
+    },
+    {
+      "epoch": 0.057777777777777775,
+      "grad_norm": 0.41374382086838146,
+      "learning_rate": 0.00019959683930076758,
+      "loss": 0.6852,
+      "step": 325
+    },
+    {
+      "epoch": 0.05795555555555556,
+      "grad_norm": 0.38767911144070144,
+      "learning_rate": 0.00019959165753924806,
+      "loss": 0.6603,
+      "step": 326
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.4146713131575808,
+      "learning_rate": 0.00019958644275796416,
+      "loss": 0.7482,
+      "step": 327
+    },
+    {
+      "epoch": 0.05831111111111111,
+      "grad_norm": 0.40282409427512333,
+      "learning_rate": 0.00019958119495864477,
+      "loss": 0.7503,
+      "step": 328
+    },
+    {
+      "epoch": 0.05848888888888889,
+      "grad_norm": 0.42306846587640345,
+      "learning_rate": 0.00019957591414302984,
+      "loss": 0.6998,
+      "step": 329
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4277605462336533,
+      "learning_rate": 0.0001995706003128702,
+      "loss": 0.7508,
+      "step": 330
+    },
+    {
+      "epoch": 0.05884444444444444,
+      "grad_norm": 0.3853085682494387,
+      "learning_rate": 0.00019956525346992768,
+      "loss": 0.6927,
+      "step": 331
+    },
+    {
+      "epoch": 0.05902222222222222,
+      "grad_norm": 0.391673017668163,
+      "learning_rate": 0.00019955987361597506,
+      "loss": 0.7491,
+      "step": 332
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.41754786645076236,
+      "learning_rate": 0.000199554460752796,
+      "loss": 0.8008,
+      "step": 333
+    },
+    {
+      "epoch": 0.05937777777777778,
+      "grad_norm": 0.3849602871243546,
+      "learning_rate": 0.00019954901488218515,
+      "loss": 0.7321,
+      "step": 334
+    },
+    {
+      "epoch": 0.059555555555555556,
+      "grad_norm": 0.3950166024985762,
+      "learning_rate": 0.00019954353600594812,
+      "loss": 0.7144,
+      "step": 335
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.4140578900735401,
+      "learning_rate": 0.00019953802412590142,
+      "loss": 0.709,
+      "step": 336
+    },
+    {
+      "epoch": 0.05991111111111111,
+      "grad_norm": 0.3870858564154028,
+      "learning_rate": 0.00019953247924387252,
+      "loss": 0.7185,
+      "step": 337
+    },
+    {
+      "epoch": 0.060088888888888886,
+      "grad_norm": 0.41320328866404843,
+      "learning_rate": 0.00019952690136169985,
+      "loss": 0.738,
+      "step": 338
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.44243060101910286,
+      "learning_rate": 0.00019952129048123274,
+      "loss": 0.7415,
+      "step": 339
+    },
+    {
+      "epoch": 0.060444444444444446,
+      "grad_norm": 0.42840659815007515,
+      "learning_rate": 0.0001995156466043315,
+      "loss": 0.7424,
+      "step": 340
+    },
+    {
+      "epoch": 0.06062222222222222,
+      "grad_norm": 0.39827708227458203,
+      "learning_rate": 0.0001995099697328674,
+      "loss": 0.7403,
+      "step": 341
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3955191883883136,
+      "learning_rate": 0.00019950425986872255,
+      "loss": 0.7187,
+      "step": 342
+    },
+    {
+      "epoch": 0.06097777777777778,
+      "grad_norm": 0.4226085872385566,
+      "learning_rate": 0.0001994985170137901,
+      "loss": 0.7091,
+      "step": 343
+    },
+    {
+      "epoch": 0.06115555555555555,
+      "grad_norm": 0.3847284881018638,
+      "learning_rate": 0.00019949274116997406,
+      "loss": 0.7337,
+      "step": 344
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.41103657174701047,
+      "learning_rate": 0.00019948693233918952,
+      "loss": 0.7142,
+      "step": 345
+    },
+    {
+      "epoch": 0.061511111111111114,
+      "grad_norm": 0.39510712866799935,
+      "learning_rate": 0.00019948109052336232,
+      "loss": 0.6515,
+      "step": 346
+    },
+    {
+      "epoch": 0.06168888888888889,
+      "grad_norm": 0.3973478546306339,
+      "learning_rate": 0.00019947521572442935,
+      "loss": 0.7053,
+      "step": 347
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.39265961824794854,
+      "learning_rate": 0.0001994693079443384,
+      "loss": 0.7317,
+      "step": 348
+    },
+    {
+      "epoch": 0.062044444444444444,
+      "grad_norm": 0.40022808878239136,
+      "learning_rate": 0.00019946336718504822,
+      "loss": 0.7467,
+      "step": 349
+    },
+    {
+      "epoch": 0.06222222222222222,
+      "grad_norm": 0.4060392673229292,
+      "learning_rate": 0.00019945739344852848,
+      "loss": 0.691,
+      "step": 350
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.38163889363721626,
+      "learning_rate": 0.00019945138673675973,
+      "loss": 0.7025,
+      "step": 351
+    },
+    {
+      "epoch": 0.06257777777777777,
+      "grad_norm": 0.3848990756785906,
+      "learning_rate": 0.00019944534705173354,
+      "loss": 0.7874,
+      "step": 352
+    },
+    {
+      "epoch": 0.06275555555555555,
+      "grad_norm": 0.40265559565058934,
+      "learning_rate": 0.00019943927439545242,
+      "loss": 0.6937,
+      "step": 353
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.3924397444910792,
+      "learning_rate": 0.0001994331687699297,
+      "loss": 0.741,
+      "step": 354
+    },
+    {
+      "epoch": 0.06311111111111112,
+      "grad_norm": 0.37538364201713864,
+      "learning_rate": 0.00019942703017718975,
+      "loss": 0.7035,
+      "step": 355
+    },
+    {
+      "epoch": 0.0632888888888889,
+      "grad_norm": 0.38866448405000675,
+      "learning_rate": 0.0001994208586192678,
+      "loss": 0.6894,
+      "step": 356
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.3903914056587825,
+      "learning_rate": 0.00019941465409821008,
+      "loss": 0.6418,
+      "step": 357
+    },
+    {
+      "epoch": 0.06364444444444445,
+      "grad_norm": 0.43251756600847535,
+      "learning_rate": 0.00019940841661607366,
+      "loss": 0.7657,
+      "step": 358
+    },
+    {
+      "epoch": 0.06382222222222222,
+      "grad_norm": 0.40484522087785463,
+      "learning_rate": 0.0001994021461749266,
+      "loss": 0.7733,
+      "step": 359
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.41265312921015945,
+      "learning_rate": 0.0001993958427768479,
+      "loss": 0.7472,
+      "step": 360
+    },
+    {
+      "epoch": 0.06417777777777778,
+      "grad_norm": 0.42083144145833196,
+      "learning_rate": 0.00019938950642392746,
+      "loss": 0.7835,
+      "step": 361
+    },
+    {
+      "epoch": 0.06435555555555555,
+      "grad_norm": 0.38595338979089616,
+      "learning_rate": 0.0001993831371182661,
+      "loss": 0.6785,
+      "step": 362
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.3908370405376001,
+      "learning_rate": 0.00019937673486197555,
+      "loss": 0.705,
+      "step": 363
+    },
+    {
+      "epoch": 0.06471111111111111,
+      "grad_norm": 0.3893853168603376,
+      "learning_rate": 0.0001993702996571785,
+      "loss": 0.7022,
+      "step": 364
+    },
+    {
+      "epoch": 0.06488888888888888,
+      "grad_norm": 0.3883852863717747,
+      "learning_rate": 0.00019936383150600856,
+      "loss": 0.6847,
+      "step": 365
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.4089026341869207,
+      "learning_rate": 0.00019935733041061027,
+      "loss": 0.7262,
+      "step": 366
+    },
+    {
+      "epoch": 0.06524444444444444,
+      "grad_norm": 0.3987094795708502,
+      "learning_rate": 0.00019935079637313906,
+      "loss": 0.7346,
+      "step": 367
+    },
+    {
+      "epoch": 0.06542222222222223,
+      "grad_norm": 0.4165262629169865,
+      "learning_rate": 0.00019934422939576124,
+      "loss": 0.717,
+      "step": 368
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.38509780963736684,
+      "learning_rate": 0.0001993376294806542,
+      "loss": 0.7156,
+      "step": 369
+    },
+    {
+      "epoch": 0.06577777777777778,
+      "grad_norm": 0.3814004299512465,
+      "learning_rate": 0.00019933099663000615,
+      "loss": 0.7497,
+      "step": 370
+    },
+    {
+      "epoch": 0.06595555555555556,
+      "grad_norm": 0.4238705344332552,
+      "learning_rate": 0.00019932433084601613,
+      "loss": 0.7415,
+      "step": 371
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.3989956611610803,
+      "learning_rate": 0.00019931763213089428,
+      "loss": 0.6942,
+      "step": 372
+    },
+    {
+      "epoch": 0.06631111111111111,
+      "grad_norm": 0.3907659131921682,
+      "learning_rate": 0.00019931090048686152,
+      "loss": 0.7014,
+      "step": 373
+    },
+    {
+      "epoch": 0.06648888888888889,
+      "grad_norm": 0.41059339211380236,
+      "learning_rate": 0.00019930413591614973,
+      "loss": 0.7995,
+      "step": 374
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.38682872150951947,
+      "learning_rate": 0.00019929733842100178,
+      "loss": 0.7056,
+      "step": 375
+    },
+    {
+      "epoch": 0.06684444444444444,
+      "grad_norm": 0.38589179904857857,
+      "learning_rate": 0.0001992905080036713,
+      "loss": 0.703,
+      "step": 376
+    },
+    {
+      "epoch": 0.06702222222222222,
+      "grad_norm": 0.38276868000179953,
+      "learning_rate": 0.000199283644666423,
+      "loss": 0.7129,
+      "step": 377
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.390499334097345,
+      "learning_rate": 0.00019927674841153237,
+      "loss": 0.7303,
+      "step": 378
+    },
+    {
+      "epoch": 0.06737777777777777,
+      "grad_norm": 0.39012598805979254,
+      "learning_rate": 0.00019926981924128594,
+      "loss": 0.7514,
+      "step": 379
+    },
+    {
+      "epoch": 0.06755555555555555,
+      "grad_norm": 0.41551698281869964,
+      "learning_rate": 0.000199262857157981,
+      "loss": 0.7134,
+      "step": 380
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.4005173535103667,
+      "learning_rate": 0.00019925586216392596,
+      "loss": 0.6885,
+      "step": 381
+    },
+    {
+      "epoch": 0.06791111111111112,
+      "grad_norm": 0.4399692154042501,
+      "learning_rate": 0.0001992488342614399,
+      "loss": 0.7621,
+      "step": 382
+    },
+    {
+      "epoch": 0.0680888888888889,
+      "grad_norm": 0.41808656118741233,
+      "learning_rate": 0.00019924177345285297,
+      "loss": 0.7132,
+      "step": 383
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.40175559844683045,
+      "learning_rate": 0.00019923467974050622,
+      "loss": 0.7174,
+      "step": 384
+    },
+    {
+      "epoch": 0.06844444444444445,
+      "grad_norm": 0.39959976325332075,
+      "learning_rate": 0.00019922755312675158,
+      "loss": 0.6822,
+      "step": 385
+    },
+    {
+      "epoch": 0.06862222222222222,
+      "grad_norm": 0.4103944462406291,
+      "learning_rate": 0.00019922039361395185,
+      "loss": 0.711,
+      "step": 386
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4316320714005958,
+      "learning_rate": 0.00019921320120448082,
+      "loss": 0.6684,
+      "step": 387
+    },
+    {
+      "epoch": 0.06897777777777778,
+      "grad_norm": 0.4087047790614361,
+      "learning_rate": 0.00019920597590072312,
+      "loss": 0.7171,
+      "step": 388
+    },
+    {
+      "epoch": 0.06915555555555555,
+      "grad_norm": 0.4121802353249436,
+      "learning_rate": 0.0001991987177050743,
+      "loss": 0.7022,
+      "step": 389
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.4057843281805508,
+      "learning_rate": 0.00019919142661994088,
+      "loss": 0.7377,
+      "step": 390
+    },
+    {
+      "epoch": 0.0695111111111111,
+      "grad_norm": 0.4324857831612135,
+      "learning_rate": 0.00019918410264774017,
+      "loss": 0.7145,
+      "step": 391
+    },
+    {
+      "epoch": 0.06968888888888888,
+      "grad_norm": 0.3964608162666485,
+      "learning_rate": 0.00019917674579090044,
+      "loss": 0.7708,
+      "step": 392
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.3991648435068707,
+      "learning_rate": 0.00019916935605186092,
+      "loss": 0.7295,
+      "step": 393
+    },
+    {
+      "epoch": 0.07004444444444445,
+      "grad_norm": 0.38554816477711273,
+      "learning_rate": 0.00019916193343307167,
+      "loss": 0.7213,
+      "step": 394
+    },
+    {
+      "epoch": 0.07022222222222223,
+      "grad_norm": 0.39011803598670747,
+      "learning_rate": 0.00019915447793699364,
+      "loss": 0.7541,
+      "step": 395
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.38368000820172654,
+      "learning_rate": 0.00019914698956609875,
+      "loss": 0.6776,
+      "step": 396
+    },
+    {
+      "epoch": 0.07057777777777778,
+      "grad_norm": 0.3949702277822226,
+      "learning_rate": 0.00019913946832286975,
+      "loss": 0.7903,
+      "step": 397
+    },
+    {
+      "epoch": 0.07075555555555556,
+      "grad_norm": 0.3927135416997767,
+      "learning_rate": 0.00019913191420980033,
+      "loss": 0.7177,
+      "step": 398
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.37457130403302863,
+      "learning_rate": 0.0001991243272293951,
+      "loss": 0.6492,
+      "step": 399
+    },
+    {
+      "epoch": 0.07111111111111111,
+      "grad_norm": 0.39570709246601554,
+      "learning_rate": 0.00019911670738416947,
+      "loss": 0.7204,
+      "step": 400
+    },
+    {
+      "epoch": 0.07128888888888889,
+      "grad_norm": 0.3960753830839489,
+      "learning_rate": 0.00019910905467664987,
+      "loss": 0.7064,
+      "step": 401
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.36061511883345115,
+      "learning_rate": 0.00019910136910937355,
+      "loss": 0.6778,
+      "step": 402
+    },
+    {
+      "epoch": 0.07164444444444444,
+      "grad_norm": 0.3871093380105151,
+      "learning_rate": 0.00019909365068488863,
+      "loss": 0.6609,
+      "step": 403
+    },
+    {
+      "epoch": 0.07182222222222222,
+      "grad_norm": 0.3783251036727788,
+      "learning_rate": 0.00019908589940575424,
+      "loss": 0.6438,
+      "step": 404
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.37978204780797625,
+      "learning_rate": 0.0001990781152745403,
+      "loss": 0.6639,
+      "step": 405
+    },
+    {
+      "epoch": 0.07217777777777777,
+      "grad_norm": 0.4093861988747356,
+      "learning_rate": 0.00019907029829382758,
+      "loss": 0.7211,
+      "step": 406
+    },
+    {
+      "epoch": 0.07235555555555556,
+      "grad_norm": 0.4047677468631361,
+      "learning_rate": 0.0001990624484662079,
+      "loss": 0.6769,
+      "step": 407
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.4070877326042871,
+      "learning_rate": 0.00019905456579428384,
+      "loss": 0.7264,
+      "step": 408
+    },
+    {
+      "epoch": 0.07271111111111112,
+      "grad_norm": 0.4071589982711702,
+      "learning_rate": 0.00019904665028066894,
+      "loss": 0.7118,
+      "step": 409
+    },
+    {
+      "epoch": 0.07288888888888889,
+      "grad_norm": 0.3874607524608504,
+      "learning_rate": 0.00019903870192798762,
+      "loss": 0.6854,
+      "step": 410
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.3784816688798273,
+      "learning_rate": 0.00019903072073887507,
+      "loss": 0.6938,
+      "step": 411
+    },
+    {
+      "epoch": 0.07324444444444445,
+      "grad_norm": 0.38133597104549394,
+      "learning_rate": 0.00019902270671597757,
+      "loss": 0.7136,
+      "step": 412
+    },
+    {
+      "epoch": 0.07342222222222222,
+      "grad_norm": 0.3969531446889939,
+      "learning_rate": 0.00019901465986195212,
+      "loss": 0.6739,
+      "step": 413
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4001476164167259,
+      "learning_rate": 0.00019900658017946672,
+      "loss": 0.7022,
+      "step": 414
+    },
+    {
+      "epoch": 0.07377777777777778,
+      "grad_norm": 0.3839686474510531,
+      "learning_rate": 0.00019899846767120014,
+      "loss": 0.7036,
+      "step": 415
+    },
+    {
+      "epoch": 0.07395555555555555,
+      "grad_norm": 0.40393162454141374,
+      "learning_rate": 0.00019899032233984215,
+      "loss": 0.7466,
+      "step": 416
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.41546775914410533,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 0.6945,
+      "step": 417
+    },
+    {
+      "epoch": 0.0743111111111111,
+      "grad_norm": 0.4024131794448322,
+      "learning_rate": 0.00019897393321866507,
+      "loss": 0.7575,
+      "step": 418
+    },
+    {
+      "epoch": 0.07448888888888888,
+      "grad_norm": 0.43695009201329216,
+      "learning_rate": 0.00019896568943427988,
+      "loss": 0.8218,
+      "step": 419
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.37219480229165597,
+      "learning_rate": 0.00019895741283767085,
+      "loss": 0.683,
+      "step": 420
+    },
+    {
+      "epoch": 0.07484444444444445,
+      "grad_norm": 0.3963728867326916,
+      "learning_rate": 0.00019894910343158225,
+      "loss": 0.7472,
+      "step": 421
+    },
+    {
+      "epoch": 0.07502222222222223,
+      "grad_norm": 0.401321228712814,
+      "learning_rate": 0.000198940761218769,
+      "loss": 0.6606,
+      "step": 422
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4166752215215881,
+      "learning_rate": 0.00019893238620199692,
+      "loss": 0.718,
+      "step": 423
+    },
+    {
+      "epoch": 0.07537777777777778,
+      "grad_norm": 0.4082404968393422,
+      "learning_rate": 0.00019892397838404286,
+      "loss": 0.684,
+      "step": 424
+    },
+    {
+      "epoch": 0.07555555555555556,
+      "grad_norm": 0.4002102640032932,
+      "learning_rate": 0.0001989155377676944,
+      "loss": 0.7701,
+      "step": 425
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.3962086195289424,
+      "learning_rate": 0.00019890706435574996,
+      "loss": 0.7124,
+      "step": 426
+    },
+    {
+      "epoch": 0.07591111111111111,
+      "grad_norm": 0.376472674399961,
+      "learning_rate": 0.0001988985581510191,
+      "loss": 0.7016,
+      "step": 427
+    },
+    {
+      "epoch": 0.07608888888888889,
+      "grad_norm": 0.3815542765291551,
+      "learning_rate": 0.0001988900191563219,
+      "loss": 0.6804,
+      "step": 428
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.39731788096853865,
+      "learning_rate": 0.00019888144737448951,
+      "loss": 0.7337,
+      "step": 429
+    },
+    {
+      "epoch": 0.07644444444444444,
+      "grad_norm": 0.40619310393560626,
+      "learning_rate": 0.00019887284280836398,
+      "loss": 0.6892,
+      "step": 430
+    },
+    {
+      "epoch": 0.07662222222222222,
+      "grad_norm": 0.4705324077268298,
+      "learning_rate": 0.0001988642054607981,
+      "loss": 0.7338,
+      "step": 431
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.39482001590724675,
+      "learning_rate": 0.00019885553533465565,
+      "loss": 0.7368,
+      "step": 432
+    },
+    {
+      "epoch": 0.07697777777777778,
+      "grad_norm": 0.4141309101913045,
+      "learning_rate": 0.00019884683243281116,
+      "loss": 0.6901,
+      "step": 433
+    },
+    {
+      "epoch": 0.07715555555555556,
+      "grad_norm": 0.38605885392346095,
+      "learning_rate": 0.00019883809675815014,
+      "loss": 0.7277,
+      "step": 434
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.39914498904299706,
+      "learning_rate": 0.00019882932831356888,
+      "loss": 0.6728,
+      "step": 435
+    },
+    {
+      "epoch": 0.07751111111111111,
+      "grad_norm": 0.6338942533284905,
+      "learning_rate": 0.00019882052710197461,
+      "loss": 0.7376,
+      "step": 436
+    },
+    {
+      "epoch": 0.07768888888888889,
+      "grad_norm": 0.40094631631964806,
+      "learning_rate": 0.0001988116931262854,
+      "loss": 0.7385,
+      "step": 437
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.416096651605623,
+      "learning_rate": 0.0001988028263894301,
+      "loss": 0.741,
+      "step": 438
+    },
+    {
+      "epoch": 0.07804444444444444,
+      "grad_norm": 0.3996699967882852,
+      "learning_rate": 0.00019879392689434852,
+      "loss": 0.708,
+      "step": 439
+    },
+    {
+      "epoch": 0.07822222222222222,
+      "grad_norm": 0.3986240266716566,
+      "learning_rate": 0.0001987849946439913,
+      "loss": 0.7422,
+      "step": 440
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.3837916756105452,
+      "learning_rate": 0.00019877602964131995,
+      "loss": 0.6942,
+      "step": 441
+    },
+    {
+      "epoch": 0.07857777777777777,
+      "grad_norm": 0.4106748207779132,
+      "learning_rate": 0.00019876703188930684,
+      "loss": 0.8022,
+      "step": 442
+    },
+    {
+      "epoch": 0.07875555555555555,
+      "grad_norm": 0.3867554407368537,
+      "learning_rate": 0.0001987580013909352,
+      "loss": 0.7392,
+      "step": 443
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.3899890604107621,
+      "learning_rate": 0.00019874893814919906,
+      "loss": 0.7282,
+      "step": 444
+    },
+    {
+      "epoch": 0.0791111111111111,
+      "grad_norm": 0.3909450016766374,
+      "learning_rate": 0.00019873984216710336,
+      "loss": 0.727,
+      "step": 445
+    },
+    {
+      "epoch": 0.0792888888888889,
+      "grad_norm": 0.3896511609914778,
+      "learning_rate": 0.0001987307134476639,
+      "loss": 0.7288,
+      "step": 446
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.37251349168789144,
+      "learning_rate": 0.00019872155199390733,
+      "loss": 0.6931,
+      "step": 447
+    },
+    {
+      "epoch": 0.07964444444444445,
+      "grad_norm": 0.3777509520845078,
+      "learning_rate": 0.00019871235780887113,
+      "loss": 0.7308,
+      "step": 448
+    },
+    {
+      "epoch": 0.07982222222222222,
+      "grad_norm": 0.36277517948573573,
+      "learning_rate": 0.00019870313089560365,
+      "loss": 0.6848,
+      "step": 449
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3919062953118981,
+      "learning_rate": 0.00019869387125716407,
+      "loss": 0.6851,
+      "step": 450
+    },
+    {
+      "epoch": 0.08017777777777778,
+      "grad_norm": 0.38761819854795565,
+      "learning_rate": 0.00019868457889662248,
+      "loss": 0.6781,
+      "step": 451
+    },
+    {
+      "epoch": 0.08035555555555556,
+      "grad_norm": 0.3788510612813003,
+      "learning_rate": 0.00019867525381705973,
+      "loss": 0.7273,
+      "step": 452
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.39016542366868956,
+      "learning_rate": 0.0001986658960215676,
+      "loss": 0.765,
+      "step": 453
+    },
+    {
+      "epoch": 0.08071111111111111,
+      "grad_norm": 0.4266466801889542,
+      "learning_rate": 0.00019865650551324866,
+      "loss": 0.7666,
+      "step": 454
+    },
+    {
+      "epoch": 0.08088888888888889,
+      "grad_norm": 0.3834137034047898,
+      "learning_rate": 0.00019864708229521636,
+      "loss": 0.6838,
+      "step": 455
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.3836031334531388,
+      "learning_rate": 0.00019863762637059495,
+      "loss": 0.669,
+      "step": 456
+    },
+    {
+      "epoch": 0.08124444444444444,
+      "grad_norm": 0.3753655698179046,
+      "learning_rate": 0.0001986281377425196,
+      "loss": 0.7294,
+      "step": 457
+    },
+    {
+      "epoch": 0.08142222222222223,
+      "grad_norm": 0.3699776675881974,
+      "learning_rate": 0.00019861861641413625,
+      "loss": 0.6958,
+      "step": 458
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.40947612279335743,
+      "learning_rate": 0.0001986090623886017,
+      "loss": 0.777,
+      "step": 459
+    },
+    {
+      "epoch": 0.08177777777777778,
+      "grad_norm": 0.3789962009920692,
+      "learning_rate": 0.00019859947566908364,
+      "loss": 0.7214,
+      "step": 460
+    },
+    {
+      "epoch": 0.08195555555555556,
+      "grad_norm": 0.36659064268536445,
+      "learning_rate": 0.00019858985625876056,
+      "loss": 0.7546,
+      "step": 461
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.41380546777912214,
+      "learning_rate": 0.00019858020416082178,
+      "loss": 0.7473,
+      "step": 462
+    },
+    {
+      "epoch": 0.08231111111111111,
+      "grad_norm": 0.38297506018788485,
+      "learning_rate": 0.00019857051937846744,
+      "loss": 0.7117,
+      "step": 463
+    },
+    {
+      "epoch": 0.08248888888888889,
+      "grad_norm": 0.3786590318115611,
+      "learning_rate": 0.00019856080191490858,
+      "loss": 0.7133,
+      "step": 464
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.39191032864838593,
+      "learning_rate": 0.00019855105177336702,
+      "loss": 0.6793,
+      "step": 465
+    },
+    {
+      "epoch": 0.08284444444444444,
+      "grad_norm": 0.40403356412990415,
+      "learning_rate": 0.0001985412689570754,
+      "loss": 0.674,
+      "step": 466
+    },
+    {
+      "epoch": 0.08302222222222222,
+      "grad_norm": 0.42610295285771455,
+      "learning_rate": 0.00019853145346927732,
+      "loss": 0.7042,
+      "step": 467
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3781768741886325,
+      "learning_rate": 0.00019852160531322707,
+      "loss": 0.6943,
+      "step": 468
+    },
+    {
+      "epoch": 0.08337777777777777,
+      "grad_norm": 0.41533041027846673,
+      "learning_rate": 0.00019851172449218978,
+      "loss": 0.6883,
+      "step": 469
+    },
+    {
+      "epoch": 0.08355555555555555,
+      "grad_norm": 0.4167632213029769,
+      "learning_rate": 0.0001985018110094415,
+      "loss": 0.7761,
+      "step": 470
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.40252436993266555,
+      "learning_rate": 0.00019849186486826906,
+      "loss": 0.7222,
+      "step": 471
+    },
+    {
+      "epoch": 0.08391111111111112,
+      "grad_norm": 0.3719893170262049,
+      "learning_rate": 0.00019848188607197008,
+      "loss": 0.7024,
+      "step": 472
+    },
+    {
+      "epoch": 0.0840888888888889,
+      "grad_norm": 0.3798984521916213,
+      "learning_rate": 0.0001984718746238531,
+      "loss": 0.679,
+      "step": 473
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.40158972712404656,
+      "learning_rate": 0.00019846183052723738,
+      "loss": 0.7276,
+      "step": 474
+    },
+    {
+      "epoch": 0.08444444444444445,
+      "grad_norm": 0.38624497311934824,
+      "learning_rate": 0.0001984517537854531,
+      "loss": 0.7185,
+      "step": 475
+    },
+    {
+      "epoch": 0.08462222222222222,
+      "grad_norm": 0.3735580173669137,
+      "learning_rate": 0.00019844164440184118,
+      "loss": 0.6791,
+      "step": 476
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.3984033532195548,
+      "learning_rate": 0.00019843150237975344,
+      "loss": 0.7067,
+      "step": 477
+    },
+    {
+      "epoch": 0.08497777777777778,
+      "grad_norm": 0.46071691804757686,
+      "learning_rate": 0.00019842132772255244,
+      "loss": 0.6828,
+      "step": 478
+    },
+    {
+      "epoch": 0.08515555555555555,
+      "grad_norm": 0.3859545760101646,
+      "learning_rate": 0.0001984111204336116,
+      "loss": 0.6721,
+      "step": 479
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.3985767473496741,
+      "learning_rate": 0.00019840088051631524,
+      "loss": 0.682,
+      "step": 480
+    },
+    {
+      "epoch": 0.08551111111111111,
+      "grad_norm": 0.3710666445505344,
+      "learning_rate": 0.00019839060797405833,
+      "loss": 0.6725,
+      "step": 481
+    },
+    {
+      "epoch": 0.08568888888888888,
+      "grad_norm": 0.38819738107860136,
+      "learning_rate": 0.0001983803028102468,
+      "loss": 0.7446,
+      "step": 482
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.3747206310678951,
+      "learning_rate": 0.00019836996502829731,
+      "loss": 0.7149,
+      "step": 483
+    },
+    {
+      "epoch": 0.08604444444444445,
+      "grad_norm": 0.3960843239759529,
+      "learning_rate": 0.0001983595946316374,
+      "loss": 0.6744,
+      "step": 484
+    },
+    {
+      "epoch": 0.08622222222222223,
+      "grad_norm": 0.3860936100139274,
+      "learning_rate": 0.00019834919162370538,
+      "loss": 0.7233,
+      "step": 485
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.36889033128684595,
+      "learning_rate": 0.00019833875600795036,
+      "loss": 0.719,
+      "step": 486
+    },
+    {
+      "epoch": 0.08657777777777778,
+      "grad_norm": 0.40055611052787216,
+      "learning_rate": 0.0001983282877878323,
+      "loss": 0.7299,
+      "step": 487
+    },
+    {
+      "epoch": 0.08675555555555556,
+      "grad_norm": 0.39221882653240764,
+      "learning_rate": 0.00019831778696682194,
+      "loss": 0.685,
+      "step": 488
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.38394915717190387,
+      "learning_rate": 0.00019830725354840089,
+      "loss": 0.7292,
+      "step": 489
+    },
+    {
+      "epoch": 0.08711111111111111,
+      "grad_norm": 0.3898539840510562,
+      "learning_rate": 0.00019829668753606146,
+      "loss": 0.7667,
+      "step": 490
+    },
+    {
+      "epoch": 0.08728888888888889,
+      "grad_norm": 0.3885663815836747,
+      "learning_rate": 0.0001982860889333069,
+      "loss": 0.7478,
+      "step": 491
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.36612102259144147,
+      "learning_rate": 0.0001982754577436511,
+      "loss": 0.6771,
+      "step": 492
+    },
+    {
+      "epoch": 0.08764444444444444,
+      "grad_norm": 0.3987280709247257,
+      "learning_rate": 0.00019826479397061893,
+      "loss": 0.709,
+      "step": 493
+    },
+    {
+      "epoch": 0.08782222222222222,
+      "grad_norm": 0.4043564451495452,
+      "learning_rate": 0.00019825409761774592,
+      "loss": 0.7026,
+      "step": 494
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.37926429061357136,
+      "learning_rate": 0.00019824336868857852,
+      "loss": 0.6648,
+      "step": 495
+    },
+    {
+      "epoch": 0.08817777777777777,
+      "grad_norm": 0.36360149442951356,
+      "learning_rate": 0.00019823260718667386,
+      "loss": 0.6901,
+      "step": 496
+    },
+    {
+      "epoch": 0.08835555555555556,
+      "grad_norm": 0.39119767824550433,
+      "learning_rate": 0.00019822181311559994,
+      "loss": 0.6665,
+      "step": 497
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.3802362637359548,
+      "learning_rate": 0.0001982109864789356,
+      "loss": 0.7356,
+      "step": 498
+    },
+    {
+      "epoch": 0.08871111111111112,
+      "grad_norm": 0.39089413244118015,
+      "learning_rate": 0.00019820012728027044,
+      "loss": 0.7005,
+      "step": 499
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.38359311480973846,
+      "learning_rate": 0.00019818923552320476,
+      "loss": 0.67,
+      "step": 500
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.3887716612794801,
+      "learning_rate": 0.0001981783112113498,
+      "loss": 0.7046,
+      "step": 501
+    },
+    {
+      "epoch": 0.08924444444444445,
+      "grad_norm": 0.41701466560451533,
+      "learning_rate": 0.00019816735434832752,
+      "loss": 0.7471,
+      "step": 502
+    },
+    {
+      "epoch": 0.08942222222222222,
+      "grad_norm": 0.3953806694358842,
+      "learning_rate": 0.00019815636493777063,
+      "loss": 0.7136,
+      "step": 503
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.3961402619781841,
+      "learning_rate": 0.00019814534298332278,
+      "loss": 0.6828,
+      "step": 504
+    },
+    {
+      "epoch": 0.08977777777777778,
+      "grad_norm": 0.3672594146424926,
+      "learning_rate": 0.00019813428848863826,
+      "loss": 0.6584,
+      "step": 505
+    },
+    {
+      "epoch": 0.08995555555555555,
+      "grad_norm": 0.3850643115863915,
+      "learning_rate": 0.00019812320145738224,
+      "loss": 0.7869,
+      "step": 506
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.37077478089251176,
+      "learning_rate": 0.00019811208189323058,
+      "loss": 0.6796,
+      "step": 507
+    },
+    {
+      "epoch": 0.0903111111111111,
+      "grad_norm": 0.4078763220472397,
+      "learning_rate": 0.00019810092979987006,
+      "loss": 0.7009,
+      "step": 508
+    },
+    {
+      "epoch": 0.09048888888888888,
+      "grad_norm": 0.41811556406244266,
+      "learning_rate": 0.00019808974518099813,
+      "loss": 0.7427,
+      "step": 509
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.37551449192697767,
+      "learning_rate": 0.00019807852804032305,
+      "loss": 0.7324,
+      "step": 510
+    },
+    {
+      "epoch": 0.09084444444444445,
+      "grad_norm": 0.38812220151818816,
+      "learning_rate": 0.00019806727838156393,
+      "loss": 0.7076,
+      "step": 511
+    },
+    {
+      "epoch": 0.09102222222222223,
+      "grad_norm": 0.39630617978651994,
+      "learning_rate": 0.0001980559962084506,
+      "loss": 0.7518,
+      "step": 512
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.39678213342399604,
+      "learning_rate": 0.00019804468152472362,
+      "loss": 0.6769,
+      "step": 513
+    },
+    {
+      "epoch": 0.09137777777777778,
+      "grad_norm": 0.4162672008944878,
+      "learning_rate": 0.00019803333433413448,
+      "loss": 0.7087,
+      "step": 514
+    },
+    {
+      "epoch": 0.09155555555555556,
+      "grad_norm": 0.4009749516627074,
+      "learning_rate": 0.0001980219546404453,
+      "loss": 0.6927,
+      "step": 515
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.4075419407703099,
+      "learning_rate": 0.00019801054244742908,
+      "loss": 0.7545,
+      "step": 516
+    },
+    {
+      "epoch": 0.09191111111111111,
+      "grad_norm": 0.3889320504824982,
+      "learning_rate": 0.0001979990977588695,
+      "loss": 0.6547,
+      "step": 517
+    },
+    {
+      "epoch": 0.09208888888888889,
+      "grad_norm": 0.38812102304168816,
+      "learning_rate": 0.0001979876205785611,
+      "loss": 0.6839,
+      "step": 518
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.3955334347617926,
+      "learning_rate": 0.0001979761109103091,
+      "loss": 0.783,
+      "step": 519
+    },
+    {
+      "epoch": 0.09244444444444444,
+      "grad_norm": 0.37131327605098874,
+      "learning_rate": 0.00019796456875792963,
+      "loss": 0.733,
+      "step": 520
+    },
+    {
+      "epoch": 0.09262222222222222,
+      "grad_norm": 0.37768671406181936,
+      "learning_rate": 0.00019795299412524945,
+      "loss": 0.6978,
+      "step": 521
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4046767262774699,
+      "learning_rate": 0.00019794138701610618,
+      "loss": 0.7884,
+      "step": 522
+    },
+    {
+      "epoch": 0.09297777777777778,
+      "grad_norm": 0.39433272676675823,
+      "learning_rate": 0.00019792974743434815,
+      "loss": 0.7499,
+      "step": 523
+    },
+    {
+      "epoch": 0.09315555555555556,
+      "grad_norm": 0.39129637695326064,
+      "learning_rate": 0.0001979180753838345,
+      "loss": 0.7109,
+      "step": 524
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.4035082219548878,
+      "learning_rate": 0.0001979063708684351,
+      "loss": 0.7287,
+      "step": 525
+    },
+    {
+      "epoch": 0.09351111111111111,
+      "grad_norm": 0.3942976843456556,
+      "learning_rate": 0.00019789463389203064,
+      "loss": 0.7476,
+      "step": 526
+    },
+    {
+      "epoch": 0.09368888888888889,
+      "grad_norm": 0.4523195035487732,
+      "learning_rate": 0.00019788286445851245,
+      "loss": 0.7001,
+      "step": 527
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.38904772680540983,
+      "learning_rate": 0.00019787106257178276,
+      "loss": 0.6834,
+      "step": 528
+    },
+    {
+      "epoch": 0.09404444444444444,
+      "grad_norm": 0.4040685501074588,
+      "learning_rate": 0.00019785922823575448,
+      "loss": 0.7488,
+      "step": 529
+    },
+    {
+      "epoch": 0.09422222222222222,
+      "grad_norm": 0.37001074183000726,
+      "learning_rate": 0.00019784736145435136,
+      "loss": 0.6912,
+      "step": 530
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.37605916188227884,
+      "learning_rate": 0.0001978354622315078,
+      "loss": 0.6769,
+      "step": 531
+    },
+    {
+      "epoch": 0.09457777777777777,
+      "grad_norm": 0.3863731577389576,
+      "learning_rate": 0.000197823530571169,
+      "loss": 0.7146,
+      "step": 532
+    },
+    {
+      "epoch": 0.09475555555555555,
+      "grad_norm": 0.38784027680178806,
+      "learning_rate": 0.00019781156647729093,
+      "loss": 0.7091,
+      "step": 533
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.39479264286450233,
+      "learning_rate": 0.00019779956995384033,
+      "loss": 0.7202,
+      "step": 534
+    },
+    {
+      "epoch": 0.0951111111111111,
+      "grad_norm": 0.388132039918643,
+      "learning_rate": 0.0001977875410047946,
+      "loss": 0.6781,
+      "step": 535
+    },
+    {
+      "epoch": 0.0952888888888889,
+      "grad_norm": 0.39178644436478455,
+      "learning_rate": 0.000197775479634142,
+      "loss": 0.7615,
+      "step": 536
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.37566978094263886,
+      "learning_rate": 0.00019776338584588153,
+      "loss": 0.6683,
+      "step": 537
+    },
+    {
+      "epoch": 0.09564444444444445,
+      "grad_norm": 0.40332078693756196,
+      "learning_rate": 0.00019775125964402283,
+      "loss": 0.7358,
+      "step": 538
+    },
+    {
+      "epoch": 0.09582222222222223,
+      "grad_norm": 0.39465881375258033,
+      "learning_rate": 0.0001977391010325864,
+      "loss": 0.7199,
+      "step": 539
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.39739941388198086,
+      "learning_rate": 0.0001977269100156035,
+      "loss": 0.7152,
+      "step": 540
+    },
+    {
+      "epoch": 0.09617777777777778,
+      "grad_norm": 0.40349461005481413,
+      "learning_rate": 0.00019771468659711595,
+      "loss": 0.7405,
+      "step": 541
+    },
+    {
+      "epoch": 0.09635555555555556,
+      "grad_norm": 0.38314651002370476,
+      "learning_rate": 0.00019770243078117656,
+      "loss": 0.7485,
+      "step": 542
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.37249511714783334,
+      "learning_rate": 0.0001976901425718487,
+      "loss": 0.7154,
+      "step": 543
+    },
+    {
+      "epoch": 0.09671111111111111,
+      "grad_norm": 0.43362403261005483,
+      "learning_rate": 0.0001976778219732066,
+      "loss": 0.6861,
+      "step": 544
+    },
+    {
+      "epoch": 0.09688888888888889,
+      "grad_norm": 0.38982482782879585,
+      "learning_rate": 0.00019766546898933508,
+      "loss": 0.7075,
+      "step": 545
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.3949764018822734,
+      "learning_rate": 0.00019765308362432987,
+      "loss": 0.7055,
+      "step": 546
+    },
+    {
+      "epoch": 0.09724444444444444,
+      "grad_norm": 0.3691251495122129,
+      "learning_rate": 0.00019764066588229734,
+      "loss": 0.7227,
+      "step": 547
+    },
+    {
+      "epoch": 0.09742222222222222,
+      "grad_norm": 0.42663727490235365,
+      "learning_rate": 0.00019762821576735463,
+      "loss": 0.7703,
+      "step": 548
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.3800437550005673,
+      "learning_rate": 0.00019761573328362953,
+      "loss": 0.676,
+      "step": 549
+    },
+    {
+      "epoch": 0.09777777777777778,
+      "grad_norm": 0.4583794656278647,
+      "learning_rate": 0.0001976032184352607,
+      "loss": 0.6876,
+      "step": 550
+    },
+    {
+      "epoch": 0.09795555555555556,
+      "grad_norm": 0.4920240888261474,
+      "learning_rate": 0.00019759067122639742,
+      "loss": 0.7439,
+      "step": 551
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.37811029285341824,
+      "learning_rate": 0.0001975780916611997,
+      "loss": 0.6923,
+      "step": 552
+    },
+    {
+      "epoch": 0.09831111111111111,
+      "grad_norm": 0.3841859644376975,
+      "learning_rate": 0.0001975654797438384,
+      "loss": 0.6769,
+      "step": 553
+    },
+    {
+      "epoch": 0.09848888888888889,
+      "grad_norm": 0.37061538644462133,
+      "learning_rate": 0.00019755283547849494,
+      "loss": 0.6762,
+      "step": 554
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.3760153276671012,
+      "learning_rate": 0.0001975401588693616,
+      "loss": 0.6798,
+      "step": 555
+    },
+    {
+      "epoch": 0.09884444444444444,
+      "grad_norm": 0.3912361835185172,
+      "learning_rate": 0.0001975274499206413,
+      "loss": 0.6902,
+      "step": 556
+    },
+    {
+      "epoch": 0.09902222222222222,
+      "grad_norm": 0.4283050030644034,
+      "learning_rate": 0.00019751470863654772,
+      "loss": 0.7364,
+      "step": 557
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3853017088387168,
+      "learning_rate": 0.00019750193502130525,
+      "loss": 0.6964,
+      "step": 558
+    },
+    {
+      "epoch": 0.09937777777777777,
+      "grad_norm": 0.3991612722825895,
+      "learning_rate": 0.000197489129079149,
+      "loss": 0.6922,
+      "step": 559
+    },
+    {
+      "epoch": 0.09955555555555555,
+      "grad_norm": 0.3960774506964915,
+      "learning_rate": 0.0001974762908143248,
+      "loss": 0.666,
+      "step": 560
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.3841384622364195,
+      "learning_rate": 0.0001974634202310892,
+      "loss": 0.713,
+      "step": 561
+    },
+    {
+      "epoch": 0.09991111111111112,
+      "grad_norm": 0.39415126927713723,
+      "learning_rate": 0.00019745051733370948,
+      "loss": 0.6728,
+      "step": 562
+    },
+    {
+      "epoch": 0.1000888888888889,
+      "grad_norm": 0.39297294397494387,
+      "learning_rate": 0.00019743758212646358,
+      "loss": 0.7126,
+      "step": 563
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.3666371431502856,
+      "learning_rate": 0.00019742461461364017,
+      "loss": 0.6904,
+      "step": 564
+    },
+    {
+      "epoch": 0.10044444444444445,
+      "grad_norm": 0.4048054635796692,
+      "learning_rate": 0.0001974116147995387,
+      "loss": 0.6987,
+      "step": 565
+    },
+    {
+      "epoch": 0.10062222222222222,
+      "grad_norm": 0.39857663269149735,
+      "learning_rate": 0.00019739858268846928,
+      "loss": 0.6878,
+      "step": 566
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.3841995053260552,
+      "learning_rate": 0.0001973855182847527,
+      "loss": 0.7525,
+      "step": 567
+    },
+    {
+      "epoch": 0.10097777777777778,
+      "grad_norm": 0.39437695649325905,
+      "learning_rate": 0.00019737242159272047,
+      "loss": 0.7416,
+      "step": 568
+    },
+    {
+      "epoch": 0.10115555555555555,
+      "grad_norm": 0.4253403948667029,
+      "learning_rate": 0.00019735929261671485,
+      "loss": 0.7424,
+      "step": 569
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.3726733760212832,
+      "learning_rate": 0.00019734613136108875,
+      "loss": 0.7197,
+      "step": 570
+    },
+    {
+      "epoch": 0.10151111111111111,
+      "grad_norm": 0.39698138722245324,
+      "learning_rate": 0.0001973329378302058,
+      "loss": 0.714,
+      "step": 571
+    },
+    {
+      "epoch": 0.10168888888888888,
+      "grad_norm": 0.3974100754920948,
+      "learning_rate": 0.00019731971202844036,
+      "loss": 0.7117,
+      "step": 572
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.377398414120431,
+      "learning_rate": 0.00019730645396017743,
+      "loss": 0.6748,
+      "step": 573
+    },
+    {
+      "epoch": 0.10204444444444444,
+      "grad_norm": 0.38139272096193433,
+      "learning_rate": 0.0001972931636298128,
+      "loss": 0.6772,
+      "step": 574
+    },
+    {
+      "epoch": 0.10222222222222223,
+      "grad_norm": 0.3869568852013764,
+      "learning_rate": 0.00019727984104175284,
+      "loss": 0.7045,
+      "step": 575
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3882846699122299,
+      "learning_rate": 0.00019726648620041468,
+      "loss": 0.6897,
+      "step": 576
+    },
+    {
+      "epoch": 0.10257777777777778,
+      "grad_norm": 0.3996476684381144,
+      "learning_rate": 0.00019725309911022617,
+      "loss": 0.735,
+      "step": 577
+    },
+    {
+      "epoch": 0.10275555555555556,
+      "grad_norm": 0.38720619170224296,
+      "learning_rate": 0.00019723967977562583,
+      "loss": 0.6308,
+      "step": 578
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.39794186627128897,
+      "learning_rate": 0.0001972262282010628,
+      "loss": 0.6839,
+      "step": 579
+    },
+    {
+      "epoch": 0.10311111111111111,
+      "grad_norm": 0.4167802324572103,
+      "learning_rate": 0.00019721274439099703,
+      "loss": 0.6449,
+      "step": 580
+    },
+    {
+      "epoch": 0.10328888888888889,
+      "grad_norm": 0.39156390031788213,
+      "learning_rate": 0.00019719922834989906,
+      "loss": 0.6392,
+      "step": 581
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.39394650573347684,
+      "learning_rate": 0.00019718568008225015,
+      "loss": 0.713,
+      "step": 582
+    },
+    {
+      "epoch": 0.10364444444444444,
+      "grad_norm": 0.43288158874449467,
+      "learning_rate": 0.0001971720995925423,
+      "loss": 0.7239,
+      "step": 583
+    },
+    {
+      "epoch": 0.10382222222222222,
+      "grad_norm": 0.43425034558482667,
+      "learning_rate": 0.0001971584868852781,
+      "loss": 0.7546,
+      "step": 584
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.36768709439661523,
+      "learning_rate": 0.00019714484196497084,
+      "loss": 0.6806,
+      "step": 585
+    },
+    {
+      "epoch": 0.10417777777777777,
+      "grad_norm": 0.39288106509619114,
+      "learning_rate": 0.00019713116483614456,
+      "loss": 0.7649,
+      "step": 586
+    },
+    {
+      "epoch": 0.10435555555555555,
+      "grad_norm": 0.38031087043482165,
+      "learning_rate": 0.0001971174555033339,
+      "loss": 0.7131,
+      "step": 587
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.39189997310837155,
+      "learning_rate": 0.00019710371397108425,
+      "loss": 0.6849,
+      "step": 588
+    },
+    {
+      "epoch": 0.10471111111111112,
+      "grad_norm": 0.36646347039971455,
+      "learning_rate": 0.0001970899402439516,
+      "loss": 0.6606,
+      "step": 589
+    },
+    {
+      "epoch": 0.10488888888888889,
+      "grad_norm": 0.3833475667981473,
+      "learning_rate": 0.0001970761343265027,
+      "loss": 0.7094,
+      "step": 590
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.3915767845271623,
+      "learning_rate": 0.00019706229622331486,
+      "loss": 0.6902,
+      "step": 591
+    },
+    {
+      "epoch": 0.10524444444444445,
+      "grad_norm": 0.3841717252940577,
+      "learning_rate": 0.00019704842593897613,
+      "loss": 0.6678,
+      "step": 592
+    },
+    {
+      "epoch": 0.10542222222222222,
+      "grad_norm": 0.3925414218590898,
+      "learning_rate": 0.00019703452347808527,
+      "loss": 0.6842,
+      "step": 593
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3924214417851129,
+      "learning_rate": 0.00019702058884525162,
+      "loss": 0.668,
+      "step": 594
+    },
+    {
+      "epoch": 0.10577777777777778,
+      "grad_norm": 0.378592348409656,
+      "learning_rate": 0.00019700662204509523,
+      "loss": 0.7097,
+      "step": 595
+    },
+    {
+      "epoch": 0.10595555555555555,
+      "grad_norm": 0.3948263616879013,
+      "learning_rate": 0.00019699262308224688,
+      "loss": 0.6849,
+      "step": 596
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.42679657208352684,
+      "learning_rate": 0.00019697859196134786,
+      "loss": 0.6851,
+      "step": 597
+    },
+    {
+      "epoch": 0.1063111111111111,
+      "grad_norm": 0.42252954161449646,
+      "learning_rate": 0.00019696452868705024,
+      "loss": 0.7239,
+      "step": 598
+    },
+    {
+      "epoch": 0.10648888888888888,
+      "grad_norm": 0.4034722794383333,
+      "learning_rate": 0.00019695043326401672,
+      "loss": 0.7172,
+      "step": 599
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.3730100807720244,
+      "learning_rate": 0.00019693630569692067,
+      "loss": 0.7048,
+      "step": 600
+    },
+    {
+      "epoch": 0.10684444444444445,
+      "grad_norm": 0.40381360093897134,
+      "learning_rate": 0.0001969221459904461,
+      "loss": 0.6368,
+      "step": 601
+    },
+    {
+      "epoch": 0.10702222222222223,
+      "grad_norm": 0.373362612896415,
+      "learning_rate": 0.0001969079541492877,
+      "loss": 0.6612,
+      "step": 602
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.3961177828699534,
+      "learning_rate": 0.00019689373017815073,
+      "loss": 0.7093,
+      "step": 603
+    },
+    {
+      "epoch": 0.10737777777777778,
+      "grad_norm": 0.38292537668712084,
+      "learning_rate": 0.00019687947408175127,
+      "loss": 0.666,
+      "step": 604
+    },
+    {
+      "epoch": 0.10755555555555556,
+      "grad_norm": 0.3962220505027578,
+      "learning_rate": 0.00019686518586481587,
+      "loss": 0.7275,
+      "step": 605
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.384398611311203,
+      "learning_rate": 0.00019685086553208184,
+      "loss": 0.7117,
+      "step": 606
+    },
+    {
+      "epoch": 0.10791111111111111,
+      "grad_norm": 0.3863803568444556,
+      "learning_rate": 0.0001968365130882971,
+      "loss": 0.6865,
+      "step": 607
+    },
+    {
+      "epoch": 0.10808888888888889,
+      "grad_norm": 0.38113676782357997,
+      "learning_rate": 0.00019682212853822022,
+      "loss": 0.6771,
+      "step": 608
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.390287641585076,
+      "learning_rate": 0.00019680771188662044,
+      "loss": 0.7175,
+      "step": 609
+    },
+    {
+      "epoch": 0.10844444444444444,
+      "grad_norm": 0.3960456752266383,
+      "learning_rate": 0.00019679326313827762,
+      "loss": 0.7158,
+      "step": 610
+    },
+    {
+      "epoch": 0.10862222222222222,
+      "grad_norm": 0.423204349538445,
+      "learning_rate": 0.00019677878229798224,
+      "loss": 0.6754,
+      "step": 611
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.405886325521896,
+      "learning_rate": 0.00019676426937053547,
+      "loss": 0.6894,
+      "step": 612
+    },
+    {
+      "epoch": 0.10897777777777778,
+      "grad_norm": 0.38614058748178565,
+      "learning_rate": 0.00019674972436074906,
+      "loss": 0.6714,
+      "step": 613
+    },
+    {
+      "epoch": 0.10915555555555556,
+      "grad_norm": 0.40135755079444874,
+      "learning_rate": 0.00019673514727344547,
+      "loss": 0.7115,
+      "step": 614
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.3767696205104445,
+      "learning_rate": 0.00019672053811345774,
+      "loss": 0.6975,
+      "step": 615
+    },
+    {
+      "epoch": 0.10951111111111111,
+      "grad_norm": 0.3707750636427583,
+      "learning_rate": 0.00019670589688562955,
+      "loss": 0.7226,
+      "step": 616
+    },
+    {
+      "epoch": 0.10968888888888889,
+      "grad_norm": 0.3557441208059503,
+      "learning_rate": 0.00019669122359481525,
+      "loss": 0.6843,
+      "step": 617
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.3587624777182352,
+      "learning_rate": 0.00019667651824587976,
+      "loss": 0.6983,
+      "step": 618
+    },
+    {
+      "epoch": 0.11004444444444444,
+      "grad_norm": 0.3868661344532939,
+      "learning_rate": 0.00019666178084369867,
+      "loss": 0.6805,
+      "step": 619
+    },
+    {
+      "epoch": 0.11022222222222222,
+      "grad_norm": 0.38114079818793906,
+      "learning_rate": 0.0001966470113931582,
+      "loss": 0.7049,
+      "step": 620
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.3785693962728146,
+      "learning_rate": 0.00019663220989915513,
+      "loss": 0.6708,
+      "step": 621
+    },
+    {
+      "epoch": 0.11057777777777777,
+      "grad_norm": 0.3920112018906953,
+      "learning_rate": 0.00019661737636659696,
+      "loss": 0.6892,
+      "step": 622
+    },
+    {
+      "epoch": 0.11075555555555555,
+      "grad_norm": 0.3884063373305085,
+      "learning_rate": 0.0001966025108004018,
+      "loss": 0.6566,
+      "step": 623
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.41569158134959805,
+      "learning_rate": 0.00019658761320549833,
+      "loss": 0.7504,
+      "step": 624
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 0.40388249896819356,
+      "learning_rate": 0.00019657268358682584,
+      "loss": 0.7174,
+      "step": 625
+    },
+    {
+      "epoch": 0.1112888888888889,
+      "grad_norm": 0.3738884454707474,
+      "learning_rate": 0.00019655772194933428,
+      "loss": 0.6927,
+      "step": 626
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.39642178791794436,
+      "learning_rate": 0.00019654272829798425,
+      "loss": 0.7324,
+      "step": 627
+    },
+    {
+      "epoch": 0.11164444444444445,
+      "grad_norm": 0.3647370336208459,
+      "learning_rate": 0.0001965277026377468,
+      "loss": 0.6447,
+      "step": 628
+    },
+    {
+      "epoch": 0.11182222222222223,
+      "grad_norm": 0.40055350302841236,
+      "learning_rate": 0.00019651264497360388,
+      "loss": 0.719,
+      "step": 629
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.468614952268159,
+      "learning_rate": 0.00019649755531054777,
+      "loss": 0.7535,
+      "step": 630
+    },
+    {
+      "epoch": 0.11217777777777778,
+      "grad_norm": 0.385594063029447,
+      "learning_rate": 0.00019648243365358146,
+      "loss": 0.6687,
+      "step": 631
+    },
+    {
+      "epoch": 0.11235555555555556,
+      "grad_norm": 0.3991226740325378,
+      "learning_rate": 0.00019646728000771862,
+      "loss": 0.7024,
+      "step": 632
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.40191609381625604,
+      "learning_rate": 0.0001964520943779834,
+      "loss": 0.6457,
+      "step": 633
+    },
+    {
+      "epoch": 0.11271111111111111,
+      "grad_norm": 0.37586566848670044,
+      "learning_rate": 0.00019643687676941068,
+      "loss": 0.7147,
+      "step": 634
+    },
+    {
+      "epoch": 0.11288888888888889,
+      "grad_norm": 0.40400999422691786,
+      "learning_rate": 0.00019642162718704585,
+      "loss": 0.6804,
+      "step": 635
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.3709453489046353,
+      "learning_rate": 0.00019640634563594496,
+      "loss": 0.6882,
+      "step": 636
+    },
+    {
+      "epoch": 0.11324444444444444,
+      "grad_norm": 0.4134633703381982,
+      "learning_rate": 0.0001963910321211746,
+      "loss": 0.7011,
+      "step": 637
+    },
+    {
+      "epoch": 0.11342222222222222,
+      "grad_norm": 0.40052956636237336,
+      "learning_rate": 0.00019637568664781195,
+      "loss": 0.6928,
+      "step": 638
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.38200631677661545,
+      "learning_rate": 0.0001963603092209449,
+      "loss": 0.7548,
+      "step": 639
+    },
+    {
+      "epoch": 0.11377777777777778,
+      "grad_norm": 0.41164011775950954,
+      "learning_rate": 0.00019634489984567184,
+      "loss": 0.7364,
+      "step": 640
+    },
+    {
+      "epoch": 0.11395555555555556,
+      "grad_norm": 0.3838618376968505,
+      "learning_rate": 0.00019632945852710173,
+      "loss": 0.6849,
+      "step": 641
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.38168638484642287,
+      "learning_rate": 0.00019631398527035422,
+      "loss": 0.7166,
+      "step": 642
+    },
+    {
+      "epoch": 0.11431111111111111,
+      "grad_norm": 0.4015902414905251,
+      "learning_rate": 0.00019629848008055948,
+      "loss": 0.6678,
+      "step": 643
+    },
+    {
+      "epoch": 0.11448888888888889,
+      "grad_norm": 0.38283172732806114,
+      "learning_rate": 0.00019628294296285823,
+      "loss": 0.6168,
+      "step": 644
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.39527449961864114,
+      "learning_rate": 0.00019626737392240188,
+      "loss": 0.7112,
+      "step": 645
+    },
+    {
+      "epoch": 0.11484444444444444,
+      "grad_norm": 0.3983478136997403,
+      "learning_rate": 0.00019625177296435234,
+      "loss": 0.7185,
+      "step": 646
+    },
+    {
+      "epoch": 0.11502222222222222,
+      "grad_norm": 0.40437207416312704,
+      "learning_rate": 0.00019623614009388216,
+      "loss": 0.7417,
+      "step": 647
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.39653682733883383,
+      "learning_rate": 0.0001962204753161744,
+      "loss": 0.7166,
+      "step": 648
+    },
+    {
+      "epoch": 0.11537777777777777,
+      "grad_norm": 0.392990450268175,
+      "learning_rate": 0.00019620477863642276,
+      "loss": 0.6733,
+      "step": 649
+    },
+    {
+      "epoch": 0.11555555555555555,
+      "grad_norm": 0.3929897861104475,
+      "learning_rate": 0.0001961890500598315,
+      "loss": 0.7326,
+      "step": 650
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.41879292758556386,
+      "learning_rate": 0.0001961732895916155,
+      "loss": 0.729,
+      "step": 651
+    },
+    {
+      "epoch": 0.11591111111111112,
+      "grad_norm": 0.4046234724717051,
+      "learning_rate": 0.00019615749723700008,
+      "loss": 0.7067,
+      "step": 652
+    },
+    {
+      "epoch": 0.1160888888888889,
+      "grad_norm": 0.4189170353654842,
+      "learning_rate": 0.00019614167300122126,
+      "loss": 0.6833,
+      "step": 653
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.40659796045052804,
+      "learning_rate": 0.0001961258168895256,
+      "loss": 0.7268,
+      "step": 654
+    },
+    {
+      "epoch": 0.11644444444444445,
+      "grad_norm": 0.42805021198864035,
+      "learning_rate": 0.00019610992890717018,
+      "loss": 0.7046,
+      "step": 655
+    },
+    {
+      "epoch": 0.11662222222222222,
+      "grad_norm": 0.3790159028051678,
+      "learning_rate": 0.00019609400905942274,
+      "loss": 0.6683,
+      "step": 656
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.3990134364124909,
+      "learning_rate": 0.0001960780573515615,
+      "loss": 0.7048,
+      "step": 657
+    },
+    {
+      "epoch": 0.11697777777777778,
+      "grad_norm": 0.399216113387533,
+      "learning_rate": 0.00019606207378887523,
+      "loss": 0.727,
+      "step": 658
+    },
+    {
+      "epoch": 0.11715555555555555,
+      "grad_norm": 0.4001163088053081,
+      "learning_rate": 0.0001960460583766634,
+      "loss": 0.719,
+      "step": 659
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.40635720899063404,
+      "learning_rate": 0.0001960300111202359,
+      "loss": 0.7201,
+      "step": 660
+    },
+    {
+      "epoch": 0.11751111111111111,
+      "grad_norm": 0.4302614248547872,
+      "learning_rate": 0.00019601393202491315,
+      "loss": 0.6858,
+      "step": 661
+    },
+    {
+      "epoch": 0.11768888888888888,
+      "grad_norm": 0.3632270023803551,
+      "learning_rate": 0.00019599782109602632,
+      "loss": 0.6455,
+      "step": 662
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.39515467618320493,
+      "learning_rate": 0.00019598167833891693,
+      "loss": 0.7083,
+      "step": 663
+    },
+    {
+      "epoch": 0.11804444444444444,
+      "grad_norm": 0.41578117914046,
+      "learning_rate": 0.0001959655037589372,
+      "loss": 0.7265,
+      "step": 664
+    },
+    {
+      "epoch": 0.11822222222222223,
+      "grad_norm": 0.4151397514822464,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.717,
+      "step": 665
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.39651625885923303,
+      "learning_rate": 0.00019593305915182788,
+      "loss": 0.7298,
+      "step": 666
+    },
+    {
+      "epoch": 0.11857777777777778,
+      "grad_norm": 0.4206811369156842,
+      "learning_rate": 0.00019591678913545544,
+      "loss": 0.7382,
+      "step": 667
+    },
+    {
+      "epoch": 0.11875555555555556,
+      "grad_norm": 0.38427741776445884,
+      "learning_rate": 0.0001959004873177267,
+      "loss": 0.7221,
+      "step": 668
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.36996537792351825,
+      "learning_rate": 0.0001958841537040466,
+      "loss": 0.6857,
+      "step": 669
+    },
+    {
+      "epoch": 0.11911111111111111,
+      "grad_norm": 0.3930460323318283,
+      "learning_rate": 0.00019586778829983054,
+      "loss": 0.7034,
+      "step": 670
+    },
+    {
+      "epoch": 0.11928888888888889,
+      "grad_norm": 0.37366446401002124,
+      "learning_rate": 0.00019585139111050453,
+      "loss": 0.7149,
+      "step": 671
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.3736414488492699,
+      "learning_rate": 0.00019583496214150507,
+      "loss": 0.6984,
+      "step": 672
+    },
+    {
+      "epoch": 0.11964444444444444,
+      "grad_norm": 0.38685954327606714,
+      "learning_rate": 0.0001958185013982792,
+      "loss": 0.676,
+      "step": 673
+    },
+    {
+      "epoch": 0.11982222222222222,
+      "grad_norm": 0.3796635393623509,
+      "learning_rate": 0.00019580200888628452,
+      "loss": 0.6818,
+      "step": 674
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4034613468508447,
+      "learning_rate": 0.00019578548461098914,
+      "loss": 0.6914,
+      "step": 675
+    },
+    {
+      "epoch": 0.12017777777777777,
+      "grad_norm": 0.38839898897411546,
+      "learning_rate": 0.0001957689285778717,
+      "loss": 0.7468,
+      "step": 676
+    },
+    {
+      "epoch": 0.12035555555555555,
+      "grad_norm": 0.380472166037475,
+      "learning_rate": 0.00019575234079242143,
+      "loss": 0.692,
+      "step": 677
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.38844141026547785,
+      "learning_rate": 0.00019573572126013793,
+      "loss": 0.6743,
+      "step": 678
+    },
+    {
+      "epoch": 0.12071111111111112,
+      "grad_norm": 0.3810917695249503,
+      "learning_rate": 0.00019571906998653148,
+      "loss": 0.7074,
+      "step": 679
+    },
+    {
+      "epoch": 0.12088888888888889,
+      "grad_norm": 0.40683239492813295,
+      "learning_rate": 0.0001957023869771229,
+      "loss": 0.7602,
+      "step": 680
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.3725691518485805,
+      "learning_rate": 0.00019568567223744339,
+      "loss": 0.6327,
+      "step": 681
+    },
+    {
+      "epoch": 0.12124444444444445,
+      "grad_norm": 0.39774666940776254,
+      "learning_rate": 0.00019566892577303478,
+      "loss": 0.7107,
+      "step": 682
+    },
+    {
+      "epoch": 0.12142222222222222,
+      "grad_norm": 0.39312447206654133,
+      "learning_rate": 0.00019565214758944936,
+      "loss": 0.686,
+      "step": 683
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3957977816573584,
+      "learning_rate": 0.00019563533769225,
+      "loss": 0.6977,
+      "step": 684
+    },
+    {
+      "epoch": 0.12177777777777778,
+      "grad_norm": 0.37847264259650193,
+      "learning_rate": 0.00019561849608700998,
+      "loss": 0.6631,
+      "step": 685
+    },
+    {
+      "epoch": 0.12195555555555555,
+      "grad_norm": 0.437690221060892,
+      "learning_rate": 0.00019560162277931325,
+      "loss": 0.7385,
+      "step": 686
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.417906191495558,
+      "learning_rate": 0.00019558471777475413,
+      "loss": 0.6765,
+      "step": 687
+    },
+    {
+      "epoch": 0.1223111111111111,
+      "grad_norm": 0.3843418399254885,
+      "learning_rate": 0.00019556778107893748,
+      "loss": 0.6687,
+      "step": 688
+    },
+    {
+      "epoch": 0.12248888888888888,
+      "grad_norm": 0.39276975611926834,
+      "learning_rate": 0.00019555081269747877,
+      "loss": 0.6949,
+      "step": 689
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.38514665606398174,
+      "learning_rate": 0.00019553381263600378,
+      "loss": 0.6814,
+      "step": 690
+    },
+    {
+      "epoch": 0.12284444444444445,
+      "grad_norm": 0.39437780132172745,
+      "learning_rate": 0.00019551678090014898,
+      "loss": 0.7068,
+      "step": 691
+    },
+    {
+      "epoch": 0.12302222222222223,
+      "grad_norm": 0.3895435528336699,
+      "learning_rate": 0.00019549971749556125,
+      "loss": 0.6872,
+      "step": 692
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.40143956438050515,
+      "learning_rate": 0.00019548262242789796,
+      "loss": 0.6738,
+      "step": 693
+    },
+    {
+      "epoch": 0.12337777777777778,
+      "grad_norm": 0.3868980566707207,
+      "learning_rate": 0.00019546549570282707,
+      "loss": 0.7085,
+      "step": 694
+    },
+    {
+      "epoch": 0.12355555555555556,
+      "grad_norm": 0.37660475618434147,
+      "learning_rate": 0.00019544833732602692,
+      "loss": 0.6794,
+      "step": 695
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.38183188794054557,
+      "learning_rate": 0.0001954311473031864,
+      "loss": 0.7068,
+      "step": 696
+    },
+    {
+      "epoch": 0.12391111111111111,
+      "grad_norm": 0.4095031622494095,
+      "learning_rate": 0.00019541392564000488,
+      "loss": 0.7735,
+      "step": 697
+    },
+    {
+      "epoch": 0.12408888888888889,
+      "grad_norm": 0.3655706106021032,
+      "learning_rate": 0.00019539667234219228,
+      "loss": 0.702,
+      "step": 698
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.3785239408015017,
+      "learning_rate": 0.0001953793874154689,
+      "loss": 0.7534,
+      "step": 699
+    },
+    {
+      "epoch": 0.12444444444444444,
+      "grad_norm": 0.3727241968243882,
+      "learning_rate": 0.00019536207086556564,
+      "loss": 0.6917,
+      "step": 700
+    },
+    {
+      "epoch": 0.12462222222222222,
+      "grad_norm": 0.37439415888648636,
+      "learning_rate": 0.00019534472269822377,
+      "loss": 0.6591,
+      "step": 701
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4015846498161762,
+      "learning_rate": 0.00019532734291919512,
+      "loss": 0.7426,
+      "step": 702
+    },
+    {
+      "epoch": 0.12497777777777777,
+      "grad_norm": 0.40124234056468516,
+      "learning_rate": 0.00019530993153424198,
+      "loss": 0.6943,
+      "step": 703
+    },
+    {
+      "epoch": 0.12515555555555555,
+      "grad_norm": 0.3850891059208508,
+      "learning_rate": 0.00019529248854913714,
+      "loss": 0.6583,
+      "step": 704
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.36339471960397257,
+      "learning_rate": 0.00019527501396966382,
+      "loss": 0.6505,
+      "step": 705
+    },
+    {
+      "epoch": 0.1255111111111111,
+      "grad_norm": 0.3676211036995528,
+      "learning_rate": 0.0001952575078016158,
+      "loss": 0.6937,
+      "step": 706
+    },
+    {
+      "epoch": 0.12568888888888888,
+      "grad_norm": 0.3773537720277207,
+      "learning_rate": 0.0001952399700507972,
+      "loss": 0.7101,
+      "step": 707
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.35930125331897794,
+      "learning_rate": 0.00019522240072302274,
+      "loss": 0.6799,
+      "step": 708
+    },
+    {
+      "epoch": 0.12604444444444443,
+      "grad_norm": 0.40235758973764196,
+      "learning_rate": 0.00019520479982411754,
+      "loss": 0.7322,
+      "step": 709
+    },
+    {
+      "epoch": 0.12622222222222224,
+      "grad_norm": 0.3927794035142052,
+      "learning_rate": 0.0001951871673599172,
+      "loss": 0.7367,
+      "step": 710
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4009191490683467,
+      "learning_rate": 0.0001951695033362678,
+      "loss": 0.7104,
+      "step": 711
+    },
+    {
+      "epoch": 0.1265777777777778,
+      "grad_norm": 0.40597785126656877,
+      "learning_rate": 0.00019515180775902586,
+      "loss": 0.7197,
+      "step": 712
+    },
+    {
+      "epoch": 0.12675555555555557,
+      "grad_norm": 0.3744733301622407,
+      "learning_rate": 0.00019513408063405837,
+      "loss": 0.662,
+      "step": 713
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.40420665526701016,
+      "learning_rate": 0.00019511632196724286,
+      "loss": 0.74,
+      "step": 714
+    },
+    {
+      "epoch": 0.12711111111111112,
+      "grad_norm": 0.37712067427193463,
+      "learning_rate": 0.00019509853176446712,
+      "loss": 0.7177,
+      "step": 715
+    },
+    {
+      "epoch": 0.1272888888888889,
+      "grad_norm": 0.3814008249906852,
+      "learning_rate": 0.0001950807100316296,
+      "loss": 0.6833,
+      "step": 716
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.37943427228223975,
+      "learning_rate": 0.0001950628567746391,
+      "loss": 0.6702,
+      "step": 717
+    },
+    {
+      "epoch": 0.12764444444444445,
+      "grad_norm": 0.4034065670836774,
+      "learning_rate": 0.00019504497199941491,
+      "loss": 0.6401,
+      "step": 718
+    },
+    {
+      "epoch": 0.12782222222222223,
+      "grad_norm": 0.4310192209484912,
+      "learning_rate": 0.00019502705571188672,
+      "loss": 0.6937,
+      "step": 719
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3838845895439845,
+      "learning_rate": 0.00019500910791799475,
+      "loss": 0.7432,
+      "step": 720
+    },
+    {
+      "epoch": 0.12817777777777778,
+      "grad_norm": 0.40288754937412263,
+      "learning_rate": 0.0001949911286236896,
+      "loss": 0.676,
+      "step": 721
+    },
+    {
+      "epoch": 0.12835555555555556,
+      "grad_norm": 0.9668915824422284,
+      "learning_rate": 0.0001949731178349323,
+      "loss": 0.6898,
+      "step": 722
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.37327470703590615,
+      "learning_rate": 0.0001949550755576944,
+      "loss": 0.6726,
+      "step": 723
+    },
+    {
+      "epoch": 0.1287111111111111,
+      "grad_norm": 0.38951041324497787,
+      "learning_rate": 0.00019493700179795779,
+      "loss": 0.7065,
+      "step": 724
+    },
+    {
+      "epoch": 0.1288888888888889,
+      "grad_norm": 0.3592578967476502,
+      "learning_rate": 0.0001949188965617149,
+      "loss": 0.6778,
+      "step": 725
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.4192139955894857,
+      "learning_rate": 0.00019490075985496857,
+      "loss": 0.6891,
+      "step": 726
+    },
+    {
+      "epoch": 0.12924444444444444,
+      "grad_norm": 0.38679690663668653,
+      "learning_rate": 0.00019488259168373197,
+      "loss": 0.753,
+      "step": 727
+    },
+    {
+      "epoch": 0.12942222222222222,
+      "grad_norm": 0.37928518286891644,
+      "learning_rate": 0.00019486439205402886,
+      "loss": 0.687,
+      "step": 728
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.38521234140186406,
+      "learning_rate": 0.0001948461609718933,
+      "loss": 0.6821,
+      "step": 729
+    },
+    {
+      "epoch": 0.12977777777777777,
+      "grad_norm": 0.3811765521477349,
+      "learning_rate": 0.0001948278984433699,
+      "loss": 0.6682,
+      "step": 730
+    },
+    {
+      "epoch": 0.12995555555555555,
+      "grad_norm": 0.3853373344282037,
+      "learning_rate": 0.00019480960447451352,
+      "loss": 0.7021,
+      "step": 731
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.3818534579985387,
+      "learning_rate": 0.00019479127907138968,
+      "loss": 0.6748,
+      "step": 732
+    },
+    {
+      "epoch": 0.1303111111111111,
+      "grad_norm": 0.3736128958052664,
+      "learning_rate": 0.0001947729222400741,
+      "loss": 0.6285,
+      "step": 733
+    },
+    {
+      "epoch": 0.13048888888888888,
+      "grad_norm": 0.4158871767940913,
+      "learning_rate": 0.00019475453398665307,
+      "loss": 0.7613,
+      "step": 734
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.4057020120832801,
+      "learning_rate": 0.0001947361143172232,
+      "loss": 0.7343,
+      "step": 735
+    },
+    {
+      "epoch": 0.13084444444444446,
+      "grad_norm": 0.4042423094083696,
+      "learning_rate": 0.00019471766323789162,
+      "loss": 0.6852,
+      "step": 736
+    },
+    {
+      "epoch": 0.13102222222222223,
+      "grad_norm": 0.4065871322352284,
+      "learning_rate": 0.00019469918075477573,
+      "loss": 0.753,
+      "step": 737
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4049602445920588,
+      "learning_rate": 0.0001946806668740035,
+      "loss": 0.7165,
+      "step": 738
+    },
+    {
+      "epoch": 0.1313777777777778,
+      "grad_norm": 0.36581838614638695,
+      "learning_rate": 0.00019466212160171322,
+      "loss": 0.7057,
+      "step": 739
+    },
+    {
+      "epoch": 0.13155555555555556,
+      "grad_norm": 0.35833651462384475,
+      "learning_rate": 0.00019464354494405357,
+      "loss": 0.6942,
+      "step": 740
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.36420006750116013,
+      "learning_rate": 0.0001946249369071837,
+      "loss": 0.6642,
+      "step": 741
+    },
+    {
+      "epoch": 0.13191111111111112,
+      "grad_norm": 0.3835386281989903,
+      "learning_rate": 0.0001946062974972731,
+      "loss": 0.6774,
+      "step": 742
+    },
+    {
+      "epoch": 0.1320888888888889,
+      "grad_norm": 0.418120550784053,
+      "learning_rate": 0.00019458762672050175,
+      "loss": 0.7305,
+      "step": 743
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.39042444598221515,
+      "learning_rate": 0.0001945689245830599,
+      "loss": 0.6663,
+      "step": 744
+    },
+    {
+      "epoch": 0.13244444444444445,
+      "grad_norm": 0.3948550958170412,
+      "learning_rate": 0.00019455019109114834,
+      "loss": 0.7107,
+      "step": 745
+    },
+    {
+      "epoch": 0.13262222222222222,
+      "grad_norm": 0.3770555486650651,
+      "learning_rate": 0.00019453142625097813,
+      "loss": 0.6831,
+      "step": 746
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.3562477937897721,
+      "learning_rate": 0.00019451263006877082,
+      "loss": 0.6525,
+      "step": 747
+    },
+    {
+      "epoch": 0.13297777777777778,
+      "grad_norm": 0.4170259799864657,
+      "learning_rate": 0.00019449380255075834,
+      "loss": 0.7611,
+      "step": 748
+    },
+    {
+      "epoch": 0.13315555555555555,
+      "grad_norm": 0.3541536601102192,
+      "learning_rate": 0.0001944749437031829,
+      "loss": 0.7023,
+      "step": 749
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.3705820712190629,
+      "learning_rate": 0.00019445605353229724,
+      "loss": 0.6944,
+      "step": 750
+    },
+    {
+      "epoch": 0.1335111111111111,
+      "grad_norm": 0.3729234298546076,
+      "learning_rate": 0.00019443713204436442,
+      "loss": 0.683,
+      "step": 751
+    },
+    {
+      "epoch": 0.13368888888888888,
+      "grad_norm": 0.3934317115739774,
+      "learning_rate": 0.00019441817924565786,
+      "loss": 0.7215,
+      "step": 752
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.3827409390397291,
+      "learning_rate": 0.00019439919514246143,
+      "loss": 0.683,
+      "step": 753
+    },
+    {
+      "epoch": 0.13404444444444444,
+      "grad_norm": 0.3655890153117205,
+      "learning_rate": 0.00019438017974106927,
+      "loss": 0.6904,
+      "step": 754
+    },
+    {
+      "epoch": 0.13422222222222221,
+      "grad_norm": 0.48897291399406645,
+      "learning_rate": 0.00019436113304778605,
+      "loss": 0.7228,
+      "step": 755
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.3964737782979409,
+      "learning_rate": 0.00019434205506892668,
+      "loss": 0.7232,
+      "step": 756
+    },
+    {
+      "epoch": 0.13457777777777777,
+      "grad_norm": 0.3747917375959724,
+      "learning_rate": 0.0001943229458108165,
+      "loss": 0.6475,
+      "step": 757
+    },
+    {
+      "epoch": 0.13475555555555555,
+      "grad_norm": 0.4084092421153487,
+      "learning_rate": 0.00019430380527979123,
+      "loss": 0.6853,
+      "step": 758
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.3954498153356176,
+      "learning_rate": 0.0001942846334821969,
+      "loss": 0.7067,
+      "step": 759
+    },
+    {
+      "epoch": 0.1351111111111111,
+      "grad_norm": 0.3923375746646342,
+      "learning_rate": 0.00019426543042438998,
+      "loss": 0.7349,
+      "step": 760
+    },
+    {
+      "epoch": 0.13528888888888888,
+      "grad_norm": 0.3846065996861539,
+      "learning_rate": 0.00019424619611273727,
+      "loss": 0.6407,
+      "step": 761
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.40324972054960234,
+      "learning_rate": 0.00019422693055361594,
+      "loss": 0.7002,
+      "step": 762
+    },
+    {
+      "epoch": 0.13564444444444446,
+      "grad_norm": 0.3809227825038652,
+      "learning_rate": 0.0001942076337534135,
+      "loss": 0.694,
+      "step": 763
+    },
+    {
+      "epoch": 0.13582222222222223,
+      "grad_norm": 0.39211048224653544,
+      "learning_rate": 0.00019418830571852786,
+      "loss": 0.6426,
+      "step": 764
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.38121732199107194,
+      "learning_rate": 0.00019416894645536722,
+      "loss": 0.6781,
+      "step": 765
+    },
+    {
+      "epoch": 0.1361777777777778,
+      "grad_norm": 0.39960463693666126,
+      "learning_rate": 0.0001941495559703502,
+      "loss": 0.6541,
+      "step": 766
+    },
+    {
+      "epoch": 0.13635555555555556,
+      "grad_norm": 0.39853418482635733,
+      "learning_rate": 0.00019413013426990573,
+      "loss": 0.7311,
+      "step": 767
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.37570928744363474,
+      "learning_rate": 0.0001941106813604731,
+      "loss": 0.6809,
+      "step": 768
+    },
+    {
+      "epoch": 0.13671111111111112,
+      "grad_norm": 0.37654309295921845,
+      "learning_rate": 0.00019409119724850203,
+      "loss": 0.6965,
+      "step": 769
+    },
+    {
+      "epoch": 0.1368888888888889,
+      "grad_norm": 0.37267827694515493,
+      "learning_rate": 0.0001940716819404524,
+      "loss": 0.6855,
+      "step": 770
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.41712734231261517,
+      "learning_rate": 0.00019405213544279457,
+      "loss": 0.7411,
+      "step": 771
+    },
+    {
+      "epoch": 0.13724444444444445,
+      "grad_norm": 0.37196299242904857,
+      "learning_rate": 0.00019403255776200923,
+      "loss": 0.6814,
+      "step": 772
+    },
+    {
+      "epoch": 0.13742222222222222,
+      "grad_norm": 0.3736471019310651,
+      "learning_rate": 0.0001940129489045874,
+      "loss": 0.66,
+      "step": 773
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4067752965062365,
+      "learning_rate": 0.00019399330887703037,
+      "loss": 0.6631,
+      "step": 774
+    },
+    {
+      "epoch": 0.13777777777777778,
+      "grad_norm": 0.41606622625490675,
+      "learning_rate": 0.00019397363768584985,
+      "loss": 0.6928,
+      "step": 775
+    },
+    {
+      "epoch": 0.13795555555555555,
+      "grad_norm": 0.37764784814524127,
+      "learning_rate": 0.00019395393533756791,
+      "loss": 0.6586,
+      "step": 776
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.37734096428376585,
+      "learning_rate": 0.00019393420183871682,
+      "loss": 0.6674,
+      "step": 777
+    },
+    {
+      "epoch": 0.1383111111111111,
+      "grad_norm": 0.488061776683202,
+      "learning_rate": 0.0001939144371958393,
+      "loss": 0.7338,
+      "step": 778
+    },
+    {
+      "epoch": 0.13848888888888888,
+      "grad_norm": 0.38761844001744455,
+      "learning_rate": 0.0001938946414154883,
+      "loss": 0.7043,
+      "step": 779
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.4212399759599747,
+      "learning_rate": 0.00019387481450422716,
+      "loss": 0.7298,
+      "step": 780
+    },
+    {
+      "epoch": 0.13884444444444444,
+      "grad_norm": 0.37564377307553876,
+      "learning_rate": 0.00019385495646862954,
+      "loss": 0.7146,
+      "step": 781
+    },
+    {
+      "epoch": 0.1390222222222222,
+      "grad_norm": 0.3613959363788971,
+      "learning_rate": 0.00019383506731527936,
+      "loss": 0.6855,
+      "step": 782
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.38638494445979815,
+      "learning_rate": 0.00019381514705077096,
+      "loss": 0.7322,
+      "step": 783
+    },
+    {
+      "epoch": 0.13937777777777777,
+      "grad_norm": 0.3848478283677541,
+      "learning_rate": 0.00019379519568170887,
+      "loss": 0.7238,
+      "step": 784
+    },
+    {
+      "epoch": 0.13955555555555554,
+      "grad_norm": 0.37843001759067885,
+      "learning_rate": 0.00019377521321470805,
+      "loss": 0.6246,
+      "step": 785
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.39755090105671587,
+      "learning_rate": 0.00019375519965639368,
+      "loss": 0.7072,
+      "step": 786
+    },
+    {
+      "epoch": 0.13991111111111112,
+      "grad_norm": 0.4177712000107054,
+      "learning_rate": 0.0001937351550134013,
+      "loss": 0.7199,
+      "step": 787
+    },
+    {
+      "epoch": 0.1400888888888889,
+      "grad_norm": 0.3866941433560375,
+      "learning_rate": 0.00019371507929237677,
+      "loss": 0.7334,
+      "step": 788
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.41247139332204347,
+      "learning_rate": 0.0001936949724999762,
+      "loss": 0.7428,
+      "step": 789
+    },
+    {
+      "epoch": 0.14044444444444446,
+      "grad_norm": 0.39944174506901275,
+      "learning_rate": 0.000193674834642866,
+      "loss": 0.6785,
+      "step": 790
+    },
+    {
+      "epoch": 0.14062222222222223,
+      "grad_norm": 0.4073098744839168,
+      "learning_rate": 0.0001936546657277229,
+      "loss": 0.6525,
+      "step": 791
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.35203231595843426,
+      "learning_rate": 0.00019363446576123403,
+      "loss": 0.6599,
+      "step": 792
+    },
+    {
+      "epoch": 0.14097777777777779,
+      "grad_norm": 0.39085949367200534,
+      "learning_rate": 0.0001936142347500966,
+      "loss": 0.7578,
+      "step": 793
+    },
+    {
+      "epoch": 0.14115555555555556,
+      "grad_norm": 0.3853118606301283,
+      "learning_rate": 0.00019359397270101832,
+      "loss": 0.7415,
+      "step": 794
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.36913894834976235,
+      "learning_rate": 0.0001935736796207171,
+      "loss": 0.6489,
+      "step": 795
+    },
+    {
+      "epoch": 0.14151111111111112,
+      "grad_norm": 0.3716800590217003,
+      "learning_rate": 0.00019355335551592105,
+      "loss": 0.6131,
+      "step": 796
+    },
+    {
+      "epoch": 0.1416888888888889,
+      "grad_norm": 0.37141903092957784,
+      "learning_rate": 0.00019353300039336873,
+      "loss": 0.688,
+      "step": 797
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.3720260022775563,
+      "learning_rate": 0.00019351261425980894,
+      "loss": 0.6668,
+      "step": 798
+    },
+    {
+      "epoch": 0.14204444444444445,
+      "grad_norm": 0.40464753411058213,
+      "learning_rate": 0.00019349219712200063,
+      "loss": 0.7121,
+      "step": 799
+    },
+    {
+      "epoch": 0.14222222222222222,
+      "grad_norm": 0.4031291162387869,
+      "learning_rate": 0.00019347174898671324,
+      "loss": 0.7358,
+      "step": 800
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.3747899842991318,
+      "learning_rate": 0.00019345126986072635,
+      "loss": 0.681,
+      "step": 801
+    },
+    {
+      "epoch": 0.14257777777777778,
+      "grad_norm": 0.3911511727555308,
+      "learning_rate": 0.00019343075975082982,
+      "loss": 0.6956,
+      "step": 802
+    },
+    {
+      "epoch": 0.14275555555555555,
+      "grad_norm": 0.3778862854504106,
+      "learning_rate": 0.00019341021866382382,
+      "loss": 0.6943,
+      "step": 803
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.3982973493657767,
+      "learning_rate": 0.00019338964660651876,
+      "loss": 0.7067,
+      "step": 804
+    },
+    {
+      "epoch": 0.1431111111111111,
+      "grad_norm": 0.3845951950334205,
+      "learning_rate": 0.00019336904358573537,
+      "loss": 0.6442,
+      "step": 805
+    },
+    {
+      "epoch": 0.14328888888888888,
+      "grad_norm": 0.3779646103778523,
+      "learning_rate": 0.0001933484096083046,
+      "loss": 0.6812,
+      "step": 806
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.37724722844955266,
+      "learning_rate": 0.00019332774468106768,
+      "loss": 0.6894,
+      "step": 807
+    },
+    {
+      "epoch": 0.14364444444444444,
+      "grad_norm": 0.3970349948910945,
+      "learning_rate": 0.0001933070488108761,
+      "loss": 0.7438,
+      "step": 808
+    },
+    {
+      "epoch": 0.1438222222222222,
+      "grad_norm": 0.38646658897235914,
+      "learning_rate": 0.00019328632200459156,
+      "loss": 0.6343,
+      "step": 809
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3725777771171682,
+      "learning_rate": 0.00019326556426908613,
+      "loss": 0.6869,
+      "step": 810
+    },
+    {
+      "epoch": 0.14417777777777777,
+      "grad_norm": 0.3967025788249028,
+      "learning_rate": 0.00019324477561124206,
+      "loss": 0.7214,
+      "step": 811
+    },
+    {
+      "epoch": 0.14435555555555554,
+      "grad_norm": 0.3609963595961742,
+      "learning_rate": 0.0001932239560379518,
+      "loss": 0.6393,
+      "step": 812
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.40156946299695406,
+      "learning_rate": 0.00019320310555611818,
+      "loss": 0.7506,
+      "step": 813
+    },
+    {
+      "epoch": 0.14471111111111112,
+      "grad_norm": 0.3830502972541065,
+      "learning_rate": 0.0001931822241726542,
+      "loss": 0.7175,
+      "step": 814
+    },
+    {
+      "epoch": 0.1448888888888889,
+      "grad_norm": 0.38994602022279967,
+      "learning_rate": 0.00019316131189448305,
+      "loss": 0.6981,
+      "step": 815
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.361486592054618,
+      "learning_rate": 0.0001931403687285383,
+      "loss": 0.6605,
+      "step": 816
+    },
+    {
+      "epoch": 0.14524444444444445,
+      "grad_norm": 0.3721825291843033,
+      "learning_rate": 0.00019311939468176368,
+      "loss": 0.6725,
+      "step": 817
+    },
+    {
+      "epoch": 0.14542222222222223,
+      "grad_norm": 0.3702339318523388,
+      "learning_rate": 0.00019309838976111311,
+      "loss": 0.6771,
+      "step": 818
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.3655943655108182,
+      "learning_rate": 0.00019307735397355088,
+      "loss": 0.6696,
+      "step": 819
+    },
+    {
+      "epoch": 0.14577777777777778,
+      "grad_norm": 0.3832003903881648,
+      "learning_rate": 0.00019305628732605137,
+      "loss": 0.7332,
+      "step": 820
+    },
+    {
+      "epoch": 0.14595555555555556,
+      "grad_norm": 0.37917627423343236,
+      "learning_rate": 0.00019303518982559932,
+      "loss": 0.717,
+      "step": 821
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.4024645477481322,
+      "learning_rate": 0.00019301406147918956,
+      "loss": 0.648,
+      "step": 822
+    },
+    {
+      "epoch": 0.14631111111111111,
+      "grad_norm": 0.38746880739552597,
+      "learning_rate": 0.0001929929022938273,
+      "loss": 0.6984,
+      "step": 823
+    },
+    {
+      "epoch": 0.1464888888888889,
+      "grad_norm": 0.3827970535115837,
+      "learning_rate": 0.00019297171227652786,
+      "loss": 0.6789,
+      "step": 824
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.40500596869525096,
+      "learning_rate": 0.00019295049143431685,
+      "loss": 0.7277,
+      "step": 825
+    },
+    {
+      "epoch": 0.14684444444444444,
+      "grad_norm": 0.37882096104908486,
+      "learning_rate": 0.00019292923977423006,
+      "loss": 0.6711,
+      "step": 826
+    },
+    {
+      "epoch": 0.14702222222222222,
+      "grad_norm": 0.4146687016604997,
+      "learning_rate": 0.0001929079573033135,
+      "loss": 0.7462,
+      "step": 827
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.38815996077635145,
+      "learning_rate": 0.0001928866440286234,
+      "loss": 0.662,
+      "step": 828
+    },
+    {
+      "epoch": 0.14737777777777777,
+      "grad_norm": 0.40178896846691553,
+      "learning_rate": 0.00019286529995722623,
+      "loss": 0.7094,
+      "step": 829
+    },
+    {
+      "epoch": 0.14755555555555555,
+      "grad_norm": 0.3623127536712287,
+      "learning_rate": 0.00019284392509619864,
+      "loss": 0.6425,
+      "step": 830
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.4051273563935649,
+      "learning_rate": 0.00019282251945262747,
+      "loss": 0.6766,
+      "step": 831
+    },
+    {
+      "epoch": 0.1479111111111111,
+      "grad_norm": 0.4040610098428571,
+      "learning_rate": 0.00019280108303360987,
+      "loss": 0.6786,
+      "step": 832
+    },
+    {
+      "epoch": 0.14808888888888888,
+      "grad_norm": 0.3902912390290996,
+      "learning_rate": 0.00019277961584625303,
+      "loss": 0.6942,
+      "step": 833
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.3738145449037745,
+      "learning_rate": 0.00019275811789767447,
+      "loss": 0.7028,
+      "step": 834
+    },
+    {
+      "epoch": 0.14844444444444443,
+      "grad_norm": 0.3713233104536498,
+      "learning_rate": 0.00019273658919500186,
+      "loss": 0.6874,
+      "step": 835
+    },
+    {
+      "epoch": 0.1486222222222222,
+      "grad_norm": 0.379256933973657,
+      "learning_rate": 0.0001927150297453731,
+      "loss": 0.6734,
+      "step": 836
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.36501729827377855,
+      "learning_rate": 0.00019269343955593618,
+      "loss": 0.6887,
+      "step": 837
+    },
+    {
+      "epoch": 0.14897777777777776,
+      "grad_norm": 0.3914571055470754,
+      "learning_rate": 0.00019267181863384946,
+      "loss": 0.6912,
+      "step": 838
+    },
+    {
+      "epoch": 0.14915555555555557,
+      "grad_norm": 0.35502848462922065,
+      "learning_rate": 0.00019265016698628132,
+      "loss": 0.6771,
+      "step": 839
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.3574149592775559,
+      "learning_rate": 0.00019262848462041045,
+      "loss": 0.6848,
+      "step": 840
+    },
+    {
+      "epoch": 0.14951111111111112,
+      "grad_norm": 0.3749511200730551,
+      "learning_rate": 0.00019260677154342564,
+      "loss": 0.7516,
+      "step": 841
+    },
+    {
+      "epoch": 0.1496888888888889,
+      "grad_norm": 0.39746920064342706,
+      "learning_rate": 0.00019258502776252587,
+      "loss": 0.6851,
+      "step": 842
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.3995760780095097,
+      "learning_rate": 0.0001925632532849204,
+      "loss": 0.7215,
+      "step": 843
+    },
+    {
+      "epoch": 0.15004444444444445,
+      "grad_norm": 0.39485424985504264,
+      "learning_rate": 0.00019254144811782845,
+      "loss": 0.7248,
+      "step": 844
+    },
+    {
+      "epoch": 0.15022222222222223,
+      "grad_norm": 0.3875916817580916,
+      "learning_rate": 0.0001925196122684797,
+      "loss": 0.6821,
+      "step": 845
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3366175677715393,
+      "learning_rate": 0.0001924977457441138,
+      "loss": 0.6446,
+      "step": 846
+    },
+    {
+      "epoch": 0.15057777777777778,
+      "grad_norm": 0.40719045534080944,
+      "learning_rate": 0.00019247584855198064,
+      "loss": 0.6923,
+      "step": 847
+    },
+    {
+      "epoch": 0.15075555555555556,
+      "grad_norm": 0.3905962557102382,
+      "learning_rate": 0.00019245392069934024,
+      "loss": 0.767,
+      "step": 848
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.3717944052657077,
+      "learning_rate": 0.00019243196219346283,
+      "loss": 0.6952,
+      "step": 849
+    },
+    {
+      "epoch": 0.1511111111111111,
+      "grad_norm": 0.3867249130380458,
+      "learning_rate": 0.0001924099730416288,
+      "loss": 0.6647,
+      "step": 850
+    },
+    {
+      "epoch": 0.1512888888888889,
+      "grad_norm": 0.40528450547748174,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 0.7014,
+      "step": 851
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.3782710266019561,
+      "learning_rate": 0.00019236590282926318,
+      "loss": 0.657,
+      "step": 852
+    },
+    {
+      "epoch": 0.15164444444444444,
+      "grad_norm": 0.40216110744235556,
+      "learning_rate": 0.0001923438217833431,
+      "loss": 0.7599,
+      "step": 853
+    },
+    {
+      "epoch": 0.15182222222222222,
+      "grad_norm": 0.3804514119338972,
+      "learning_rate": 0.00019232171012068948,
+      "loss": 0.7034,
+      "step": 854
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.373251838160946,
+      "learning_rate": 0.00019229956784863345,
+      "loss": 0.6598,
+      "step": 855
+    },
+    {
+      "epoch": 0.15217777777777777,
+      "grad_norm": 0.3969445549142826,
+      "learning_rate": 0.00019227739497451637,
+      "loss": 0.7327,
+      "step": 856
+    },
+    {
+      "epoch": 0.15235555555555555,
+      "grad_norm": 0.39468893924795007,
+      "learning_rate": 0.00019225519150568965,
+      "loss": 0.7299,
+      "step": 857
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.3839251712329289,
+      "learning_rate": 0.00019223295744951485,
+      "loss": 0.7275,
+      "step": 858
+    },
+    {
+      "epoch": 0.1527111111111111,
+      "grad_norm": 0.38860140658945475,
+      "learning_rate": 0.00019221069281336378,
+      "loss": 0.7066,
+      "step": 859
+    },
+    {
+      "epoch": 0.15288888888888888,
+      "grad_norm": 0.3564378114831085,
+      "learning_rate": 0.00019218839760461827,
+      "loss": 0.6811,
+      "step": 860
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.3994409828105225,
+      "learning_rate": 0.00019216607183067033,
+      "loss": 0.7238,
+      "step": 861
+    },
+    {
+      "epoch": 0.15324444444444443,
+      "grad_norm": 0.38267358680000324,
+      "learning_rate": 0.0001921437154989221,
+      "loss": 0.6866,
+      "step": 862
+    },
+    {
+      "epoch": 0.1534222222222222,
+      "grad_norm": 0.3787665686182062,
+      "learning_rate": 0.00019212132861678587,
+      "loss": 0.6758,
+      "step": 863
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3968012321798156,
+      "learning_rate": 0.00019209891119168404,
+      "loss": 0.7525,
+      "step": 864
+    },
+    {
+      "epoch": 0.1537777777777778,
+      "grad_norm": 0.3755118168875679,
+      "learning_rate": 0.00019207646323104915,
+      "loss": 0.6768,
+      "step": 865
+    },
+    {
+      "epoch": 0.15395555555555557,
+      "grad_norm": 0.37239361100877194,
+      "learning_rate": 0.00019205398474232384,
+      "loss": 0.693,
+      "step": 866
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.39339555350299876,
+      "learning_rate": 0.0001920314757329609,
+      "loss": 0.7043,
+      "step": 867
+    },
+    {
+      "epoch": 0.15431111111111112,
+      "grad_norm": 0.3842754898541647,
+      "learning_rate": 0.00019200893621042323,
+      "loss": 0.6884,
+      "step": 868
+    },
+    {
+      "epoch": 0.1544888888888889,
+      "grad_norm": 0.3661887441910969,
+      "learning_rate": 0.0001919863661821838,
+      "loss": 0.6418,
+      "step": 869
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.40312379655848163,
+      "learning_rate": 0.00019196376565572577,
+      "loss": 0.7169,
+      "step": 870
+    },
+    {
+      "epoch": 0.15484444444444445,
+      "grad_norm": 0.3691160143514685,
+      "learning_rate": 0.00019194113463854242,
+      "loss": 0.6753,
+      "step": 871
+    },
+    {
+      "epoch": 0.15502222222222223,
+      "grad_norm": 0.39931681001637737,
+      "learning_rate": 0.00019191847313813703,
+      "loss": 0.6856,
+      "step": 872
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.38596794025358083,
+      "learning_rate": 0.00019189578116202307,
+      "loss": 0.7086,
+      "step": 873
+    },
+    {
+      "epoch": 0.15537777777777778,
+      "grad_norm": 0.36835465349722907,
+      "learning_rate": 0.0001918730587177241,
+      "loss": 0.674,
+      "step": 874
+    },
+    {
+      "epoch": 0.15555555555555556,
+      "grad_norm": 0.3682904690381493,
+      "learning_rate": 0.00019185030581277384,
+      "loss": 0.6113,
+      "step": 875
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.37647941723590267,
+      "learning_rate": 0.00019182752245471596,
+      "loss": 0.756,
+      "step": 876
+    },
+    {
+      "epoch": 0.1559111111111111,
+      "grad_norm": 0.3824400362358979,
+      "learning_rate": 0.00019180470865110436,
+      "loss": 0.6926,
+      "step": 877
+    },
+    {
+      "epoch": 0.1560888888888889,
+      "grad_norm": 0.3706013649073851,
+      "learning_rate": 0.000191781864409503,
+      "loss": 0.6355,
+      "step": 878
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.36518079225092087,
+      "learning_rate": 0.00019175898973748589,
+      "loss": 0.7308,
+      "step": 879
+    },
+    {
+      "epoch": 0.15644444444444444,
+      "grad_norm": 0.3803257423170877,
+      "learning_rate": 0.00019173608464263721,
+      "loss": 0.7124,
+      "step": 880
+    },
+    {
+      "epoch": 0.15662222222222222,
+      "grad_norm": 0.37408064422087633,
+      "learning_rate": 0.00019171314913255113,
+      "loss": 0.67,
+      "step": 881
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.41669295893778413,
+      "learning_rate": 0.00019169018321483198,
+      "loss": 0.738,
+      "step": 882
+    },
+    {
+      "epoch": 0.15697777777777777,
+      "grad_norm": 0.3915679222832886,
+      "learning_rate": 0.00019166718689709415,
+      "loss": 0.6898,
+      "step": 883
+    },
+    {
+      "epoch": 0.15715555555555555,
+      "grad_norm": 0.3714762991721066,
+      "learning_rate": 0.00019164416018696207,
+      "loss": 0.6736,
+      "step": 884
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.38048514131328887,
+      "learning_rate": 0.00019162110309207034,
+      "loss": 0.6572,
+      "step": 885
+    },
+    {
+      "epoch": 0.1575111111111111,
+      "grad_norm": 0.3957511094551649,
+      "learning_rate": 0.0001915980156200635,
+      "loss": 0.7551,
+      "step": 886
+    },
+    {
+      "epoch": 0.15768888888888888,
+      "grad_norm": 0.3749959449674772,
+      "learning_rate": 0.0001915748977785963,
+      "loss": 0.6837,
+      "step": 887
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.3708924692046955,
+      "learning_rate": 0.00019155174957533343,
+      "loss": 0.6334,
+      "step": 888
+    },
+    {
+      "epoch": 0.15804444444444443,
+      "grad_norm": 0.3723815872834324,
+      "learning_rate": 0.00019152857101794978,
+      "loss": 0.6853,
+      "step": 889
+    },
+    {
+      "epoch": 0.1582222222222222,
+      "grad_norm": 0.3574084991834564,
+      "learning_rate": 0.00019150536211413023,
+      "loss": 0.6665,
+      "step": 890
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.37994981717056847,
+      "learning_rate": 0.00019148212287156967,
+      "loss": 0.6421,
+      "step": 891
+    },
+    {
+      "epoch": 0.1585777777777778,
+      "grad_norm": 0.3719690546774784,
+      "learning_rate": 0.00019145885329797317,
+      "loss": 0.6532,
+      "step": 892
+    },
+    {
+      "epoch": 0.15875555555555557,
+      "grad_norm": 0.3948652525477549,
+      "learning_rate": 0.00019143555340105572,
+      "loss": 0.7026,
+      "step": 893
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.39236242936492893,
+      "learning_rate": 0.0001914122231885425,
+      "loss": 0.7022,
+      "step": 894
+    },
+    {
+      "epoch": 0.15911111111111112,
+      "grad_norm": 0.38184154499779155,
+      "learning_rate": 0.00019138886266816866,
+      "loss": 0.7161,
+      "step": 895
+    },
+    {
+      "epoch": 0.1592888888888889,
+      "grad_norm": 0.3773011500516085,
+      "learning_rate": 0.00019136547184767943,
+      "loss": 0.6643,
+      "step": 896
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.367319783999322,
+      "learning_rate": 0.00019134205073483002,
+      "loss": 0.6633,
+      "step": 897
+    },
+    {
+      "epoch": 0.15964444444444445,
+      "grad_norm": 0.3676518190361671,
+      "learning_rate": 0.0001913185993373858,
+      "loss": 0.6798,
+      "step": 898
+    },
+    {
+      "epoch": 0.15982222222222223,
+      "grad_norm": 0.40317481122827503,
+      "learning_rate": 0.00019129511766312205,
+      "loss": 0.7495,
+      "step": 899
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3797450239775051,
+      "learning_rate": 0.0001912716057198242,
+      "loss": 0.6783,
+      "step": 900
+    },
+    {
+      "epoch": 0.16017777777777778,
+      "grad_norm": 0.3666360257069798,
+      "learning_rate": 0.00019124806351528766,
+      "loss": 0.6667,
+      "step": 901
+    },
+    {
+      "epoch": 0.16035555555555556,
+      "grad_norm": 0.37469060589716036,
+      "learning_rate": 0.0001912244910573179,
+      "loss": 0.6894,
+      "step": 902
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.3856385979008903,
+      "learning_rate": 0.00019120088835373038,
+      "loss": 0.6585,
+      "step": 903
+    },
+    {
+      "epoch": 0.1607111111111111,
+      "grad_norm": 0.37598435152839466,
+      "learning_rate": 0.00019117725541235061,
+      "loss": 0.67,
+      "step": 904
+    },
+    {
+      "epoch": 0.1608888888888889,
+      "grad_norm": 0.36572523451244027,
+      "learning_rate": 0.00019115359224101416,
+      "loss": 0.7003,
+      "step": 905
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.38165170390812214,
+      "learning_rate": 0.00019112989884756653,
+      "loss": 0.7348,
+      "step": 906
+    },
+    {
+      "epoch": 0.16124444444444444,
+      "grad_norm": 0.36227885276166877,
+      "learning_rate": 0.00019110617523986333,
+      "loss": 0.7095,
+      "step": 907
+    },
+    {
+      "epoch": 0.16142222222222222,
+      "grad_norm": 0.36840554757525734,
+      "learning_rate": 0.00019108242142577023,
+      "loss": 0.6283,
+      "step": 908
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.37681657881387676,
+      "learning_rate": 0.0001910586374131627,
+      "loss": 0.672,
+      "step": 909
+    },
+    {
+      "epoch": 0.16177777777777777,
+      "grad_norm": 0.3589724367086767,
+      "learning_rate": 0.00019103482320992647,
+      "loss": 0.6667,
+      "step": 910
+    },
+    {
+      "epoch": 0.16195555555555555,
+      "grad_norm": 0.3633758066410066,
+      "learning_rate": 0.00019101097882395717,
+      "loss": 0.7471,
+      "step": 911
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.3714861769343233,
+      "learning_rate": 0.0001909871042631604,
+      "loss": 0.7148,
+      "step": 912
+    },
+    {
+      "epoch": 0.1623111111111111,
+      "grad_norm": 0.38220576216861196,
+      "learning_rate": 0.00019096319953545185,
+      "loss": 0.6937,
+      "step": 913
+    },
+    {
+      "epoch": 0.16248888888888888,
+      "grad_norm": 0.34252174084917913,
+      "learning_rate": 0.00019093926464875714,
+      "loss": 0.627,
+      "step": 914
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.3910193525377364,
+      "learning_rate": 0.00019091529961101191,
+      "loss": 0.674,
+      "step": 915
+    },
+    {
+      "epoch": 0.16284444444444446,
+      "grad_norm": 0.37212335353670445,
+      "learning_rate": 0.00019089130443016182,
+      "loss": 0.6672,
+      "step": 916
+    },
+    {
+      "epoch": 0.16302222222222224,
+      "grad_norm": 0.3723265939088728,
+      "learning_rate": 0.0001908672791141625,
+      "loss": 0.6823,
+      "step": 917
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3980200236736282,
+      "learning_rate": 0.0001908432236709796,
+      "loss": 0.6584,
+      "step": 918
+    },
+    {
+      "epoch": 0.1633777777777778,
+      "grad_norm": 0.36951075328814814,
+      "learning_rate": 0.00019081913810858872,
+      "loss": 0.6336,
+      "step": 919
+    },
+    {
+      "epoch": 0.16355555555555557,
+      "grad_norm": 0.3681292106163915,
+      "learning_rate": 0.00019079502243497546,
+      "loss": 0.6975,
+      "step": 920
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.36736987542538535,
+      "learning_rate": 0.00019077087665813545,
+      "loss": 0.6912,
+      "step": 921
+    },
+    {
+      "epoch": 0.16391111111111112,
+      "grad_norm": 0.3953614753595474,
+      "learning_rate": 0.00019074670078607418,
+      "loss": 0.7226,
+      "step": 922
+    },
+    {
+      "epoch": 0.1640888888888889,
+      "grad_norm": 0.3842973294649797,
+      "learning_rate": 0.00019072249482680726,
+      "loss": 0.7125,
+      "step": 923
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.3505606297193894,
+      "learning_rate": 0.0001906982587883602,
+      "loss": 0.6727,
+      "step": 924
+    },
+    {
+      "epoch": 0.16444444444444445,
+      "grad_norm": 0.3632278428872631,
+      "learning_rate": 0.00019067399267876849,
+      "loss": 0.6996,
+      "step": 925
+    },
+    {
+      "epoch": 0.16462222222222223,
+      "grad_norm": 0.3545503866907047,
+      "learning_rate": 0.0001906496965060776,
+      "loss": 0.6641,
+      "step": 926
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.3632330341334364,
+      "learning_rate": 0.00019062537027834297,
+      "loss": 0.6547,
+      "step": 927
+    },
+    {
+      "epoch": 0.16497777777777778,
+      "grad_norm": 0.38323046059134197,
+      "learning_rate": 0.00019060101400362998,
+      "loss": 0.6662,
+      "step": 928
+    },
+    {
+      "epoch": 0.16515555555555556,
+      "grad_norm": 0.39803430647266524,
+      "learning_rate": 0.00019057662769001395,
+      "loss": 0.6472,
+      "step": 929
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.3762222200575266,
+      "learning_rate": 0.0001905522113455803,
+      "loss": 0.6311,
+      "step": 930
+    },
+    {
+      "epoch": 0.1655111111111111,
+      "grad_norm": 0.3611497893779868,
+      "learning_rate": 0.00019052776497842423,
+      "loss": 0.6708,
+      "step": 931
+    },
+    {
+      "epoch": 0.16568888888888889,
+      "grad_norm": 0.3633431135172028,
+      "learning_rate": 0.000190503288596651,
+      "loss": 0.6384,
+      "step": 932
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.36716431339514577,
+      "learning_rate": 0.00019047878220837576,
+      "loss": 0.6679,
+      "step": 933
+    },
+    {
+      "epoch": 0.16604444444444444,
+      "grad_norm": 0.3761611957212563,
+      "learning_rate": 0.00019045424582172368,
+      "loss": 0.6928,
+      "step": 934
+    },
+    {
+      "epoch": 0.16622222222222222,
+      "grad_norm": 0.36756920537908794,
+      "learning_rate": 0.00019042967944482981,
+      "loss": 0.7036,
+      "step": 935
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3705242573861348,
+      "learning_rate": 0.00019040508308583917,
+      "loss": 0.6677,
+      "step": 936
+    },
+    {
+      "epoch": 0.16657777777777777,
+      "grad_norm": 0.3476817772622256,
+      "learning_rate": 0.00019038045675290674,
+      "loss": 0.6404,
+      "step": 937
+    },
+    {
+      "epoch": 0.16675555555555555,
+      "grad_norm": 0.36164452836198513,
+      "learning_rate": 0.0001903558004541974,
+      "loss": 0.6751,
+      "step": 938
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.3613310282905976,
+      "learning_rate": 0.00019033111419788597,
+      "loss": 0.678,
+      "step": 939
+    },
+    {
+      "epoch": 0.1671111111111111,
+      "grad_norm": 0.4002764076703055,
+      "learning_rate": 0.00019030639799215727,
+      "loss": 0.6583,
+      "step": 940
+    },
+    {
+      "epoch": 0.16728888888888888,
+      "grad_norm": 0.383884582251674,
+      "learning_rate": 0.00019028165184520598,
+      "loss": 0.7168,
+      "step": 941
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.3513166988467683,
+      "learning_rate": 0.00019025687576523662,
+      "loss": 0.6887,
+      "step": 942
+    },
+    {
+      "epoch": 0.16764444444444446,
+      "grad_norm": 0.3712351944457513,
+      "learning_rate": 0.00019023206976046388,
+      "loss": 0.6896,
+      "step": 943
+    },
+    {
+      "epoch": 0.16782222222222223,
+      "grad_norm": 0.3548838696930136,
+      "learning_rate": 0.00019020723383911215,
+      "loss": 0.7209,
+      "step": 944
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3609831047187673,
+      "learning_rate": 0.00019018236800941586,
+      "loss": 0.6913,
+      "step": 945
+    },
+    {
+      "epoch": 0.1681777777777778,
+      "grad_norm": 0.38494901957599786,
+      "learning_rate": 0.00019015747227961924,
+      "loss": 0.6726,
+      "step": 946
+    },
+    {
+      "epoch": 0.16835555555555556,
+      "grad_norm": 0.376045438024239,
+      "learning_rate": 0.00019013254665797656,
+      "loss": 0.7049,
+      "step": 947
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.38693239070360796,
+      "learning_rate": 0.00019010759115275198,
+      "loss": 0.7093,
+      "step": 948
+    },
+    {
+      "epoch": 0.16871111111111112,
+      "grad_norm": 0.39279322099221703,
+      "learning_rate": 0.00019008260577221947,
+      "loss": 0.7007,
+      "step": 949
+    },
+    {
+      "epoch": 0.1688888888888889,
+      "grad_norm": 0.36528657687701077,
+      "learning_rate": 0.000190057590524663,
+      "loss": 0.6862,
+      "step": 950
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.4067825862277342,
+      "learning_rate": 0.0001900325454183764,
+      "loss": 0.7108,
+      "step": 951
+    },
+    {
+      "epoch": 0.16924444444444445,
+      "grad_norm": 0.3641457720727108,
+      "learning_rate": 0.00019000747046166345,
+      "loss": 0.6795,
+      "step": 952
+    },
+    {
+      "epoch": 0.16942222222222222,
+      "grad_norm": 0.38005048120742807,
+      "learning_rate": 0.00018998236566283774,
+      "loss": 0.7111,
+      "step": 953
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.40237289294467093,
+      "learning_rate": 0.00018995723103022285,
+      "loss": 0.6531,
+      "step": 954
+    },
+    {
+      "epoch": 0.16977777777777778,
+      "grad_norm": 0.3828704240170546,
+      "learning_rate": 0.00018993206657215214,
+      "loss": 0.664,
+      "step": 955
+    },
+    {
+      "epoch": 0.16995555555555555,
+      "grad_norm": 0.3707010162882291,
+      "learning_rate": 0.00018990687229696903,
+      "loss": 0.6801,
+      "step": 956
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.3766939634441239,
+      "learning_rate": 0.0001898816482130266,
+      "loss": 0.663,
+      "step": 957
+    },
+    {
+      "epoch": 0.1703111111111111,
+      "grad_norm": 0.3875508305819283,
+      "learning_rate": 0.000189856394328688,
+      "loss": 0.66,
+      "step": 958
+    },
+    {
+      "epoch": 0.17048888888888888,
+      "grad_norm": 0.39607092680599376,
+      "learning_rate": 0.0001898311106523262,
+      "loss": 0.6969,
+      "step": 959
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.3748046646768422,
+      "learning_rate": 0.00018980579719232404,
+      "loss": 0.7157,
+      "step": 960
+    },
+    {
+      "epoch": 0.17084444444444444,
+      "grad_norm": 0.3550300218043015,
+      "learning_rate": 0.00018978045395707418,
+      "loss": 0.6794,
+      "step": 961
+    },
+    {
+      "epoch": 0.17102222222222221,
+      "grad_norm": 0.39915666639760633,
+      "learning_rate": 0.00018975508095497924,
+      "loss": 0.6239,
+      "step": 962
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.39528568088530114,
+      "learning_rate": 0.00018972967819445174,
+      "loss": 0.722,
+      "step": 963
+    },
+    {
+      "epoch": 0.17137777777777777,
+      "grad_norm": 0.42818435107281805,
+      "learning_rate": 0.0001897042456839139,
+      "loss": 0.7361,
+      "step": 964
+    },
+    {
+      "epoch": 0.17155555555555554,
+      "grad_norm": 0.3770331825732747,
+      "learning_rate": 0.000189678783431798,
+      "loss": 0.6903,
+      "step": 965
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.3690606710884741,
+      "learning_rate": 0.000189653291446546,
+      "loss": 0.6654,
+      "step": 966
+    },
+    {
+      "epoch": 0.1719111111111111,
+      "grad_norm": 0.3531078932464131,
+      "learning_rate": 0.00018962776973660987,
+      "loss": 0.6985,
+      "step": 967
+    },
+    {
+      "epoch": 0.1720888888888889,
+      "grad_norm": 0.37621156797683303,
+      "learning_rate": 0.00018960221831045137,
+      "loss": 0.6532,
+      "step": 968
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.36891033017600644,
+      "learning_rate": 0.00018957663717654208,
+      "loss": 0.6621,
+      "step": 969
+    },
+    {
+      "epoch": 0.17244444444444446,
+      "grad_norm": 0.3706211159271918,
+      "learning_rate": 0.00018955102634336346,
+      "loss": 0.68,
+      "step": 970
+    },
+    {
+      "epoch": 0.17262222222222223,
+      "grad_norm": 0.3501615441517381,
+      "learning_rate": 0.00018952538581940687,
+      "loss": 0.6794,
+      "step": 971
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.38998123444222355,
+      "learning_rate": 0.0001894997156131734,
+      "loss": 0.6217,
+      "step": 972
+    },
+    {
+      "epoch": 0.17297777777777779,
+      "grad_norm": 0.395084525965259,
+      "learning_rate": 0.00018947401573317412,
+      "loss": 0.6329,
+      "step": 973
+    },
+    {
+      "epoch": 0.17315555555555556,
+      "grad_norm": 0.44863575803275124,
+      "learning_rate": 0.0001894482861879298,
+      "loss": 0.7058,
+      "step": 974
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.389612453394495,
+      "learning_rate": 0.00018942252698597113,
+      "loss": 0.6733,
+      "step": 975
+    },
+    {
+      "epoch": 0.17351111111111112,
+      "grad_norm": 0.4150308547143673,
+      "learning_rate": 0.00018939673813583863,
+      "loss": 0.718,
+      "step": 976
+    },
+    {
+      "epoch": 0.1736888888888889,
+      "grad_norm": 0.37633665114754683,
+      "learning_rate": 0.00018937091964608263,
+      "loss": 0.6395,
+      "step": 977
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.36311496791114056,
+      "learning_rate": 0.00018934507152526325,
+      "loss": 0.6534,
+      "step": 978
+    },
+    {
+      "epoch": 0.17404444444444445,
+      "grad_norm": 0.3742321074084477,
+      "learning_rate": 0.00018931919378195052,
+      "loss": 0.6848,
+      "step": 979
+    },
+    {
+      "epoch": 0.17422222222222222,
+      "grad_norm": 0.36516688034793926,
+      "learning_rate": 0.00018929328642472418,
+      "loss": 0.6838,
+      "step": 980
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.3752231104254026,
+      "learning_rate": 0.00018926734946217395,
+      "loss": 0.6907,
+      "step": 981
+    },
+    {
+      "epoch": 0.17457777777777778,
+      "grad_norm": 0.3654848572864537,
+      "learning_rate": 0.0001892413829028992,
+      "loss": 0.6774,
+      "step": 982
+    },
+    {
+      "epoch": 0.17475555555555555,
+      "grad_norm": 0.3718312413505301,
+      "learning_rate": 0.0001892153867555092,
+      "loss": 0.7198,
+      "step": 983
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.4503316142695314,
+      "learning_rate": 0.00018918936102862302,
+      "loss": 0.6622,
+      "step": 984
+    },
+    {
+      "epoch": 0.1751111111111111,
+      "grad_norm": 0.37168340520887666,
+      "learning_rate": 0.00018916330573086953,
+      "loss": 0.655,
+      "step": 985
+    },
+    {
+      "epoch": 0.17528888888888888,
+      "grad_norm": 0.3760416902427817,
+      "learning_rate": 0.00018913722087088736,
+      "loss": 0.6525,
+      "step": 986
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.3695281597407702,
+      "learning_rate": 0.00018911110645732505,
+      "loss": 0.6682,
+      "step": 987
+    },
+    {
+      "epoch": 0.17564444444444444,
+      "grad_norm": 0.3729231290106923,
+      "learning_rate": 0.00018908496249884084,
+      "loss": 0.6777,
+      "step": 988
+    },
+    {
+      "epoch": 0.1758222222222222,
+      "grad_norm": 0.3816775557778491,
+      "learning_rate": 0.00018905878900410275,
+      "loss": 0.6856,
+      "step": 989
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3594686002260559,
+      "learning_rate": 0.00018903258598178876,
+      "loss": 0.6836,
+      "step": 990
+    },
+    {
+      "epoch": 0.17617777777777777,
+      "grad_norm": 0.38593876136560634,
+      "learning_rate": 0.00018900635344058645,
+      "loss": 0.7419,
+      "step": 991
+    },
+    {
+      "epoch": 0.17635555555555554,
+      "grad_norm": 0.3638198093469802,
+      "learning_rate": 0.00018898009138919322,
+      "loss": 0.6186,
+      "step": 992
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.354852666313951,
+      "learning_rate": 0.00018895379983631635,
+      "loss": 0.6307,
+      "step": 993
+    },
+    {
+      "epoch": 0.17671111111111112,
+      "grad_norm": 0.3799000596961431,
+      "learning_rate": 0.00018892747879067286,
+      "loss": 0.7246,
+      "step": 994
+    },
+    {
+      "epoch": 0.1768888888888889,
+      "grad_norm": 0.4116639770867036,
+      "learning_rate": 0.00018890112826098948,
+      "loss": 0.6918,
+      "step": 995
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.36948365701052455,
+      "learning_rate": 0.0001888747482560028,
+      "loss": 0.6586,
+      "step": 996
+    },
+    {
+      "epoch": 0.17724444444444445,
+      "grad_norm": 0.3844647498152997,
+      "learning_rate": 0.00018884833878445912,
+      "loss": 0.6963,
+      "step": 997
+    },
+    {
+      "epoch": 0.17742222222222223,
+      "grad_norm": 0.396528136258244,
+      "learning_rate": 0.00018882189985511456,
+      "loss": 0.7181,
+      "step": 998
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.34685819831335285,
+      "learning_rate": 0.00018879543147673502,
+      "loss": 0.6369,
+      "step": 999
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.36814974577224374,
+      "learning_rate": 0.00018876893365809606,
+      "loss": 0.6956,
+      "step": 1000
+    },
+    {
+      "epoch": 0.17795555555555556,
+      "grad_norm": 0.35038795028472525,
+      "learning_rate": 0.00018874240640798316,
+      "loss": 0.6551,
+      "step": 1001
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.3776709427322688,
+      "learning_rate": 0.0001887158497351914,
+      "loss": 0.6986,
+      "step": 1002
+    },
+    {
+      "epoch": 0.17831111111111111,
+      "grad_norm": 0.38182007870136747,
+      "learning_rate": 0.00018868926364852567,
+      "loss": 0.6663,
+      "step": 1003
+    },
+    {
+      "epoch": 0.1784888888888889,
+      "grad_norm": 0.39021592777210223,
+      "learning_rate": 0.0001886626481568007,
+      "loss": 0.7273,
+      "step": 1004
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.37070819002176686,
+      "learning_rate": 0.00018863600326884082,
+      "loss": 0.6592,
+      "step": 1005
+    },
+    {
+      "epoch": 0.17884444444444444,
+      "grad_norm": 0.368270598240532,
+      "learning_rate": 0.00018860932899348028,
+      "loss": 0.6836,
+      "step": 1006
+    },
+    {
+      "epoch": 0.17902222222222222,
+      "grad_norm": 0.3905545408808732,
+      "learning_rate": 0.0001885826253395629,
+      "loss": 0.684,
+      "step": 1007
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3659047557041727,
+      "learning_rate": 0.00018855589231594227,
+      "loss": 0.6591,
+      "step": 1008
+    },
+    {
+      "epoch": 0.17937777777777777,
+      "grad_norm": 0.37032199327762255,
+      "learning_rate": 0.0001885291299314819,
+      "loss": 0.6927,
+      "step": 1009
+    },
+    {
+      "epoch": 0.17955555555555555,
+      "grad_norm": 0.38979255847967137,
+      "learning_rate": 0.0001885023381950548,
+      "loss": 0.6703,
+      "step": 1010
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.35839924991171945,
+      "learning_rate": 0.00018847551711554384,
+      "loss": 0.673,
+      "step": 1011
+    },
+    {
+      "epoch": 0.1799111111111111,
+      "grad_norm": 0.3550182368855843,
+      "learning_rate": 0.0001884486667018416,
+      "loss": 0.6717,
+      "step": 1012
+    },
+    {
+      "epoch": 0.18008888888888888,
+      "grad_norm": 0.39327615547968137,
+      "learning_rate": 0.00018842178696285039,
+      "loss": 0.6808,
+      "step": 1013
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.39124528079775905,
+      "learning_rate": 0.00018839487790748216,
+      "loss": 0.6865,
+      "step": 1014
+    },
+    {
+      "epoch": 0.18044444444444444,
+      "grad_norm": 0.4126227435855467,
+      "learning_rate": 0.0001883679395446587,
+      "loss": 0.714,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1806222222222222,
+      "grad_norm": 0.37866834865644117,
+      "learning_rate": 0.00018834097188331143,
+      "loss": 0.6533,
+      "step": 1016
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3790988565105198,
+      "learning_rate": 0.00018831397493238158,
+      "loss": 0.6755,
+      "step": 1017
+    },
+    {
+      "epoch": 0.18097777777777777,
+      "grad_norm": 0.35713611305074694,
+      "learning_rate": 0.00018828694870082,
+      "loss": 0.6608,
+      "step": 1018
+    },
+    {
+      "epoch": 0.18115555555555554,
+      "grad_norm": 0.39112658937760464,
+      "learning_rate": 0.00018825989319758724,
+      "loss": 0.6462,
+      "step": 1019
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.36100079597207796,
+      "learning_rate": 0.00018823280843165363,
+      "loss": 0.653,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18151111111111112,
+      "grad_norm": 0.3580258160869801,
+      "learning_rate": 0.00018820569441199917,
+      "loss": 0.6888,
+      "step": 1021
+    },
+    {
+      "epoch": 0.1816888888888889,
+      "grad_norm": 0.34887978321262814,
+      "learning_rate": 0.00018817855114761352,
+      "loss": 0.6378,
+      "step": 1022
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.35454007586306613,
+      "learning_rate": 0.00018815137864749612,
+      "loss": 0.6636,
+      "step": 1023
+    },
+    {
+      "epoch": 0.18204444444444445,
+      "grad_norm": 0.3384296633572218,
+      "learning_rate": 0.000188124176920656,
+      "loss": 0.6838,
+      "step": 1024
+    },
+    {
+      "epoch": 0.18222222222222223,
+      "grad_norm": 0.3640204934659153,
+      "learning_rate": 0.00018809694597611201,
+      "loss": 0.6872,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.35392282573488176,
+      "learning_rate": 0.00018806968582289253,
+      "loss": 0.7036,
+      "step": 1026
+    },
+    {
+      "epoch": 0.18257777777777778,
+      "grad_norm": 0.3781764662799718,
+      "learning_rate": 0.00018804239647003573,
+      "loss": 0.6995,
+      "step": 1027
+    },
+    {
+      "epoch": 0.18275555555555556,
+      "grad_norm": 0.3786562802147849,
+      "learning_rate": 0.00018801507792658942,
+      "loss": 0.6985,
+      "step": 1028
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.37520482173238917,
+      "learning_rate": 0.00018798773020161117,
+      "loss": 0.7104,
+      "step": 1029
+    },
+    {
+      "epoch": 0.1831111111111111,
+      "grad_norm": 0.3881926927530677,
+      "learning_rate": 0.0001879603533041681,
+      "loss": 0.6706,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1832888888888889,
+      "grad_norm": 0.40272556819394756,
+      "learning_rate": 0.00018793294724333707,
+      "loss": 0.695,
+      "step": 1031
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.3660902327326268,
+      "learning_rate": 0.00018790551202820462,
+      "loss": 0.7255,
+      "step": 1032
+    },
+    {
+      "epoch": 0.18364444444444444,
+      "grad_norm": 0.36260949205740023,
+      "learning_rate": 0.00018787804766786693,
+      "loss": 0.6562,
+      "step": 1033
+    },
+    {
+      "epoch": 0.18382222222222222,
+      "grad_norm": 0.364765675122991,
+      "learning_rate": 0.0001878505541714298,
+      "loss": 0.6426,
+      "step": 1034
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.36886135312743806,
+      "learning_rate": 0.00018782303154800886,
+      "loss": 0.6976,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18417777777777777,
+      "grad_norm": 0.38799783711154834,
+      "learning_rate": 0.00018779547980672917,
+      "loss": 0.6739,
+      "step": 1036
+    },
+    {
+      "epoch": 0.18435555555555555,
+      "grad_norm": 0.37845233085869384,
+      "learning_rate": 0.00018776789895672558,
+      "loss": 0.6657,
+      "step": 1037
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.3840572114088358,
+      "learning_rate": 0.00018774028900714256,
+      "loss": 0.6153,
+      "step": 1038
+    },
+    {
+      "epoch": 0.1847111111111111,
+      "grad_norm": 0.3970548335396864,
+      "learning_rate": 0.00018771264996713424,
+      "loss": 0.6633,
+      "step": 1039
+    },
+    {
+      "epoch": 0.18488888888888888,
+      "grad_norm": 0.37743976282598407,
+      "learning_rate": 0.0001876849818458644,
+      "loss": 0.6736,
+      "step": 1040
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.376065022022482,
+      "learning_rate": 0.00018765728465250644,
+      "loss": 0.6345,
+      "step": 1041
+    },
+    {
+      "epoch": 0.18524444444444443,
+      "grad_norm": 0.37318612648855637,
+      "learning_rate": 0.00018762955839624334,
+      "loss": 0.6859,
+      "step": 1042
+    },
+    {
+      "epoch": 0.1854222222222222,
+      "grad_norm": 0.3891986085529512,
+      "learning_rate": 0.0001876018030862679,
+      "loss": 0.7351,
+      "step": 1043
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.39996236404332447,
+      "learning_rate": 0.00018757401873178235,
+      "loss": 0.7462,
+      "step": 1044
+    },
+    {
+      "epoch": 0.18577777777777776,
+      "grad_norm": 0.35164083079776937,
+      "learning_rate": 0.00018754620534199864,
+      "loss": 0.6848,
+      "step": 1045
+    },
+    {
+      "epoch": 0.18595555555555557,
+      "grad_norm": 0.3540584455289317,
+      "learning_rate": 0.00018751836292613838,
+      "loss": 0.6246,
+      "step": 1046
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.37539709411181454,
+      "learning_rate": 0.00018749049149343274,
+      "loss": 0.6679,
+      "step": 1047
+    },
+    {
+      "epoch": 0.18631111111111112,
+      "grad_norm": 0.3821921339030591,
+      "learning_rate": 0.00018746259105312257,
+      "loss": 0.6825,
+      "step": 1048
+    },
+    {
+      "epoch": 0.1864888888888889,
+      "grad_norm": 0.38234697402910833,
+      "learning_rate": 0.00018743466161445823,
+      "loss": 0.65,
+      "step": 1049
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.38949180460348376,
+      "learning_rate": 0.00018740670318669983,
+      "loss": 0.6988,
+      "step": 1050
+    },
+    {
+      "epoch": 0.18684444444444445,
+      "grad_norm": 0.37381928872938097,
+      "learning_rate": 0.000187378715779117,
+      "loss": 0.661,
+      "step": 1051
+    },
+    {
+      "epoch": 0.18702222222222223,
+      "grad_norm": 0.36656815412398536,
+      "learning_rate": 0.00018735069940098903,
+      "loss": 0.6857,
+      "step": 1052
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.38026359716448904,
+      "learning_rate": 0.00018732265406160476,
+      "loss": 0.6694,
+      "step": 1053
+    },
+    {
+      "epoch": 0.18737777777777778,
+      "grad_norm": 0.3784755671013034,
+      "learning_rate": 0.00018729457977026274,
+      "loss": 0.7031,
+      "step": 1054
+    },
+    {
+      "epoch": 0.18755555555555556,
+      "grad_norm": 0.3738130753243982,
+      "learning_rate": 0.00018726647653627093,
+      "loss": 0.6592,
+      "step": 1055
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.3678085945927551,
+      "learning_rate": 0.00018723834436894707,
+      "loss": 0.6805,
+      "step": 1056
+    },
+    {
+      "epoch": 0.1879111111111111,
+      "grad_norm": 0.3917628992193831,
+      "learning_rate": 0.00018721018327761842,
+      "loss": 0.7123,
+      "step": 1057
+    },
+    {
+      "epoch": 0.1880888888888889,
+      "grad_norm": 0.40096744433793907,
+      "learning_rate": 0.0001871819932716218,
+      "loss": 0.7008,
+      "step": 1058
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.3693331942483457,
+      "learning_rate": 0.0001871537743603037,
+      "loss": 0.7261,
+      "step": 1059
+    },
+    {
+      "epoch": 0.18844444444444444,
+      "grad_norm": 0.3690413756896375,
+      "learning_rate": 0.0001871255265530201,
+      "loss": 0.6289,
+      "step": 1060
+    },
+    {
+      "epoch": 0.18862222222222222,
+      "grad_norm": 0.36045732447648826,
+      "learning_rate": 0.0001870972498591366,
+      "loss": 0.6246,
+      "step": 1061
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4519262981598446,
+      "learning_rate": 0.00018706894428802845,
+      "loss": 0.7178,
+      "step": 1062
+    },
+    {
+      "epoch": 0.18897777777777777,
+      "grad_norm": 0.385301417601109,
+      "learning_rate": 0.0001870406098490803,
+      "loss": 0.6625,
+      "step": 1063
+    },
+    {
+      "epoch": 0.18915555555555555,
+      "grad_norm": 0.37344797452249623,
+      "learning_rate": 0.00018701224655168658,
+      "loss": 0.7175,
+      "step": 1064
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.3830148666530466,
+      "learning_rate": 0.0001869838544052511,
+      "loss": 0.6695,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1895111111111111,
+      "grad_norm": 0.4002807163476561,
+      "learning_rate": 0.00018695543341918736,
+      "loss": 0.7022,
+      "step": 1066
+    },
+    {
+      "epoch": 0.18968888888888888,
+      "grad_norm": 0.36048817733599303,
+      "learning_rate": 0.00018692698360291837,
+      "loss": 0.611,
+      "step": 1067
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.37550275521026466,
+      "learning_rate": 0.00018689850496587674,
+      "loss": 0.6527,
+      "step": 1068
+    },
+    {
+      "epoch": 0.19004444444444443,
+      "grad_norm": 0.38382058586566487,
+      "learning_rate": 0.0001868699975175045,
+      "loss": 0.7059,
+      "step": 1069
+    },
+    {
+      "epoch": 0.1902222222222222,
+      "grad_norm": 0.38953057931205226,
+      "learning_rate": 0.00018684146126725351,
+      "loss": 0.6981,
+      "step": 1070
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.38404872425187175,
+      "learning_rate": 0.00018681289622458485,
+      "loss": 0.6733,
+      "step": 1071
+    },
+    {
+      "epoch": 0.1905777777777778,
+      "grad_norm": 0.3899713802272258,
+      "learning_rate": 0.00018678430239896937,
+      "loss": 0.6879,
+      "step": 1072
+    },
+    {
+      "epoch": 0.19075555555555557,
+      "grad_norm": 0.4081394693136384,
+      "learning_rate": 0.00018675567979988743,
+      "loss": 0.7543,
+      "step": 1073
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.3680491743867154,
+      "learning_rate": 0.00018672702843682882,
+      "loss": 0.6949,
+      "step": 1074
+    },
+    {
+      "epoch": 0.19111111111111112,
+      "grad_norm": 0.4201453327037143,
+      "learning_rate": 0.000186698348319293,
+      "loss": 0.7089,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1912888888888889,
+      "grad_norm": 0.38747821429503604,
+      "learning_rate": 0.00018666963945678888,
+      "loss": 0.7007,
+      "step": 1076
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.3713182780208938,
+      "learning_rate": 0.00018664090185883491,
+      "loss": 0.6327,
+      "step": 1077
+    },
+    {
+      "epoch": 0.19164444444444445,
+      "grad_norm": 0.4076024682018734,
+      "learning_rate": 0.00018661213553495913,
+      "loss": 0.7065,
+      "step": 1078
+    },
+    {
+      "epoch": 0.19182222222222223,
+      "grad_norm": 0.37472837532627157,
+      "learning_rate": 0.00018658334049469904,
+      "loss": 0.6532,
+      "step": 1079
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.377628716421935,
+      "learning_rate": 0.00018655451674760168,
+      "loss": 0.7094,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19217777777777778,
+      "grad_norm": 0.426814684041566,
+      "learning_rate": 0.00018652566430322356,
+      "loss": 0.6952,
+      "step": 1081
+    },
+    {
+      "epoch": 0.19235555555555556,
+      "grad_norm": 0.3935512954679948,
+      "learning_rate": 0.00018649678317113084,
+      "loss": 0.7195,
+      "step": 1082
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.3554223296686633,
+      "learning_rate": 0.000186467873360899,
+      "loss": 0.6075,
+      "step": 1083
+    },
+    {
+      "epoch": 0.1927111111111111,
+      "grad_norm": 0.4143116372175908,
+      "learning_rate": 0.00018643893488211327,
+      "loss": 0.754,
+      "step": 1084
+    },
+    {
+      "epoch": 0.1928888888888889,
+      "grad_norm": 0.39760016319044933,
+      "learning_rate": 0.00018640996774436808,
+      "loss": 0.7025,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.41038883584327823,
+      "learning_rate": 0.00018638097195726764,
+      "loss": 0.7028,
+      "step": 1086
+    },
+    {
+      "epoch": 0.19324444444444444,
+      "grad_norm": 0.3971912174566233,
+      "learning_rate": 0.00018635194753042553,
+      "loss": 0.6829,
+      "step": 1087
+    },
+    {
+      "epoch": 0.19342222222222222,
+      "grad_norm": 0.3866693652689321,
+      "learning_rate": 0.00018632289447346483,
+      "loss": 0.6848,
+      "step": 1088
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.3750370247759865,
+      "learning_rate": 0.00018629381279601813,
+      "loss": 0.6715,
+      "step": 1089
+    },
+    {
+      "epoch": 0.19377777777777777,
+      "grad_norm": 0.37228851729537327,
+      "learning_rate": 0.00018626470250772748,
+      "loss": 0.6566,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19395555555555555,
+      "grad_norm": 0.36048117851376804,
+      "learning_rate": 0.00018623556361824445,
+      "loss": 0.6651,
+      "step": 1091
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.38891722694839975,
+      "learning_rate": 0.00018620639613723013,
+      "loss": 0.6675,
+      "step": 1092
+    },
+    {
+      "epoch": 0.1943111111111111,
+      "grad_norm": 0.3675322596141409,
+      "learning_rate": 0.00018617720007435497,
+      "loss": 0.713,
+      "step": 1093
+    },
+    {
+      "epoch": 0.19448888888888888,
+      "grad_norm": 0.3682768200879474,
+      "learning_rate": 0.00018614797543929903,
+      "loss": 0.6895,
+      "step": 1094
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.3645532602208841,
+      "learning_rate": 0.00018611872224175177,
+      "loss": 0.6805,
+      "step": 1095
+    },
+    {
+      "epoch": 0.19484444444444443,
+      "grad_norm": 0.37646682665981573,
+      "learning_rate": 0.00018608944049141205,
+      "loss": 0.6566,
+      "step": 1096
+    },
+    {
+      "epoch": 0.19502222222222224,
+      "grad_norm": 0.36288741972391836,
+      "learning_rate": 0.00018606013019798837,
+      "loss": 0.6724,
+      "step": 1097
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.39136349420864747,
+      "learning_rate": 0.00018603079137119864,
+      "loss": 0.667,
+      "step": 1098
+    },
+    {
+      "epoch": 0.1953777777777778,
+      "grad_norm": 0.3861845596742569,
+      "learning_rate": 0.00018600142402077006,
+      "loss": 0.6934,
+      "step": 1099
+    },
+    {
+      "epoch": 0.19555555555555557,
+      "grad_norm": 0.36018266872198745,
+      "learning_rate": 0.00018597202815643952,
+      "loss": 0.6939,
+      "step": 1100
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.3583860976993596,
+      "learning_rate": 0.00018594260378795323,
+      "loss": 0.6655,
+      "step": 1101
+    },
+    {
+      "epoch": 0.19591111111111112,
+      "grad_norm": 0.3635719108851077,
+      "learning_rate": 0.00018591315092506688,
+      "loss": 0.6408,
+      "step": 1102
+    },
+    {
+      "epoch": 0.1960888888888889,
+      "grad_norm": 1.5344025595617339,
+      "learning_rate": 0.0001858836695775456,
+      "loss": 0.6816,
+      "step": 1103
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.38066363645383966,
+      "learning_rate": 0.000185854159755164,
+      "loss": 0.6644,
+      "step": 1104
+    },
+    {
+      "epoch": 0.19644444444444445,
+      "grad_norm": 0.37412340137287314,
+      "learning_rate": 0.00018582462146770614,
+      "loss": 0.6683,
+      "step": 1105
+    },
+    {
+      "epoch": 0.19662222222222223,
+      "grad_norm": 0.3715445539287525,
+      "learning_rate": 0.00018579505472496544,
+      "loss": 0.6458,
+      "step": 1106
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.38897752233594024,
+      "learning_rate": 0.00018576545953674476,
+      "loss": 0.6806,
+      "step": 1107
+    },
+    {
+      "epoch": 0.19697777777777778,
+      "grad_norm": 0.3869561469046338,
+      "learning_rate": 0.00018573583591285648,
+      "loss": 0.6239,
+      "step": 1108
+    },
+    {
+      "epoch": 0.19715555555555556,
+      "grad_norm": 0.3909276500815709,
+      "learning_rate": 0.00018570618386312235,
+      "loss": 0.728,
+      "step": 1109
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.37638523453247363,
+      "learning_rate": 0.00018567650339737358,
+      "loss": 0.6342,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1975111111111111,
+      "grad_norm": 0.36314456399322503,
+      "learning_rate": 0.0001856467945254507,
+      "loss": 0.6378,
+      "step": 1111
+    },
+    {
+      "epoch": 0.1976888888888889,
+      "grad_norm": 0.3690453086516602,
+      "learning_rate": 0.0001856170572572038,
+      "loss": 0.6807,
+      "step": 1112
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.3829964987340547,
+      "learning_rate": 0.00018558729160249229,
+      "loss": 0.6822,
+      "step": 1113
+    },
+    {
+      "epoch": 0.19804444444444444,
+      "grad_norm": 0.38615860279156744,
+      "learning_rate": 0.00018555749757118498,
+      "loss": 0.6673,
+      "step": 1114
+    },
+    {
+      "epoch": 0.19822222222222222,
+      "grad_norm": 0.3771590897552604,
+      "learning_rate": 0.00018552767517316022,
+      "loss": 0.6499,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4042250038542116,
+      "learning_rate": 0.00018549782441830556,
+      "loss": 0.6549,
+      "step": 1116
+    },
+    {
+      "epoch": 0.19857777777777777,
+      "grad_norm": 0.3644069189474034,
+      "learning_rate": 0.00018546794531651816,
+      "loss": 0.6466,
+      "step": 1117
+    },
+    {
+      "epoch": 0.19875555555555555,
+      "grad_norm": 0.3879639771946161,
+      "learning_rate": 0.00018543803787770443,
+      "loss": 0.7134,
+      "step": 1118
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.38777353313294133,
+      "learning_rate": 0.00018540810211178024,
+      "loss": 0.6996,
+      "step": 1119
+    },
+    {
+      "epoch": 0.1991111111111111,
+      "grad_norm": 0.3712480770903855,
+      "learning_rate": 0.0001853781380286708,
+      "loss": 0.724,
+      "step": 1120
+    },
+    {
+      "epoch": 0.19928888888888888,
+      "grad_norm": 0.3586363074948968,
+      "learning_rate": 0.00018534814563831082,
+      "loss": 0.6881,
+      "step": 1121
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.3699735860253999,
+      "learning_rate": 0.00018531812495064428,
+      "loss": 0.6816,
+      "step": 1122
+    },
+    {
+      "epoch": 0.19964444444444446,
+      "grad_norm": 0.3763551299303036,
+      "learning_rate": 0.0001852880759756246,
+      "loss": 0.62,
+      "step": 1123
+    },
+    {
+      "epoch": 0.19982222222222223,
+      "grad_norm": 0.37159190733663094,
+      "learning_rate": 0.0001852579987232145,
+      "loss": 0.7075,
+      "step": 1124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.369739302552831,
+      "learning_rate": 0.00018522789320338622,
+      "loss": 0.6516,
+      "step": 1125
+    },
+    {
+      "epoch": 0.2001777777777778,
+      "grad_norm": 0.3880099189965292,
+      "learning_rate": 0.00018519775942612128,
+      "loss": 0.6751,
+      "step": 1126
+    },
+    {
+      "epoch": 0.20035555555555556,
+      "grad_norm": 0.39833414610430823,
+      "learning_rate": 0.0001851675974014105,
+      "loss": 0.6953,
+      "step": 1127
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.40907109587737156,
+      "learning_rate": 0.0001851374071392543,
+      "loss": 0.6608,
+      "step": 1128
+    },
+    {
+      "epoch": 0.20071111111111112,
+      "grad_norm": 0.3701645480867035,
+      "learning_rate": 0.0001851071886496621,
+      "loss": 0.6348,
+      "step": 1129
+    },
+    {
+      "epoch": 0.2008888888888889,
+      "grad_norm": 0.3721107847160279,
+      "learning_rate": 0.0001850769419426531,
+      "loss": 0.6942,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.42932904686343043,
+      "learning_rate": 0.00018504666702825548,
+      "loss": 0.7384,
+      "step": 1131
+    },
+    {
+      "epoch": 0.20124444444444445,
+      "grad_norm": 0.38971189939948153,
+      "learning_rate": 0.00018501636391650701,
+      "loss": 0.647,
+      "step": 1132
+    },
+    {
+      "epoch": 0.20142222222222222,
+      "grad_norm": 0.36170462957011823,
+      "learning_rate": 0.0001849860326174547,
+      "loss": 0.6633,
+      "step": 1133
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4286187198519224,
+      "learning_rate": 0.00018495567314115495,
+      "loss": 0.6968,
+      "step": 1134
+    },
+    {
+      "epoch": 0.20177777777777778,
+      "grad_norm": 0.3949704438013471,
+      "learning_rate": 0.00018492528549767353,
+      "loss": 0.701,
+      "step": 1135
+    },
+    {
+      "epoch": 0.20195555555555555,
+      "grad_norm": 0.39499882521052326,
+      "learning_rate": 0.00018489486969708543,
+      "loss": 0.6867,
+      "step": 1136
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.3849631930084498,
+      "learning_rate": 0.00018486442574947511,
+      "loss": 0.6707,
+      "step": 1137
+    },
+    {
+      "epoch": 0.2023111111111111,
+      "grad_norm": 0.38731904488506846,
+      "learning_rate": 0.0001848339536649363,
+      "loss": 0.7322,
+      "step": 1138
+    },
+    {
+      "epoch": 0.20248888888888888,
+      "grad_norm": 0.384619797734893,
+      "learning_rate": 0.00018480345345357204,
+      "loss": 0.711,
+      "step": 1139
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.3669035594777077,
+      "learning_rate": 0.00018477292512549475,
+      "loss": 0.6201,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20284444444444444,
+      "grad_norm": 0.36143735625158474,
+      "learning_rate": 0.00018474236869082616,
+      "loss": 0.6939,
+      "step": 1141
+    },
+    {
+      "epoch": 0.20302222222222222,
+      "grad_norm": 0.38597058955210695,
+      "learning_rate": 0.00018471178415969722,
+      "loss": 0.6483,
+      "step": 1142
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3661880236347438,
+      "learning_rate": 0.00018468117154224839,
+      "loss": 0.6735,
+      "step": 1143
+    },
+    {
+      "epoch": 0.20337777777777777,
+      "grad_norm": 0.3666624021547499,
+      "learning_rate": 0.00018465053084862923,
+      "loss": 0.669,
+      "step": 1144
+    },
+    {
+      "epoch": 0.20355555555555555,
+      "grad_norm": 0.3685974860703644,
+      "learning_rate": 0.00018461986208899878,
+      "loss": 0.6852,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.3775971460694314,
+      "learning_rate": 0.00018458916527352526,
+      "loss": 0.6222,
+      "step": 1146
+    },
+    {
+      "epoch": 0.2039111111111111,
+      "grad_norm": 0.37062533603571957,
+      "learning_rate": 0.00018455844041238625,
+      "loss": 0.6558,
+      "step": 1147
+    },
+    {
+      "epoch": 0.20408888888888888,
+      "grad_norm": 0.36951976564161504,
+      "learning_rate": 0.0001845276875157687,
+      "loss": 0.6766,
+      "step": 1148
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.36297876171397325,
+      "learning_rate": 0.0001844969065938687,
+      "loss": 0.6684,
+      "step": 1149
+    },
+    {
+      "epoch": 0.20444444444444446,
+      "grad_norm": 0.3855925854954929,
+      "learning_rate": 0.0001844660976568917,
+      "loss": 0.6203,
+      "step": 1150
+    },
+    {
+      "epoch": 0.20462222222222223,
+      "grad_norm": 0.36069190151612895,
+      "learning_rate": 0.00018443526071505254,
+      "loss": 0.6697,
+      "step": 1151
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3919835110382578,
+      "learning_rate": 0.0001844043957785752,
+      "loss": 0.66,
+      "step": 1152
+    },
+    {
+      "epoch": 0.2049777777777778,
+      "grad_norm": 0.3871294120088186,
+      "learning_rate": 0.00018437350285769295,
+      "loss": 0.654,
+      "step": 1153
+    },
+    {
+      "epoch": 0.20515555555555556,
+      "grad_norm": 0.3764036048957266,
+      "learning_rate": 0.00018434258196264845,
+      "loss": 0.6317,
+      "step": 1154
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.3569369602465215,
+      "learning_rate": 0.00018431163310369354,
+      "loss": 0.6095,
+      "step": 1155
+    },
+    {
+      "epoch": 0.20551111111111112,
+      "grad_norm": 0.3682996887968764,
+      "learning_rate": 0.00018428065629108934,
+      "loss": 0.6586,
+      "step": 1156
+    },
+    {
+      "epoch": 0.2056888888888889,
+      "grad_norm": 0.362514413631995,
+      "learning_rate": 0.00018424965153510635,
+      "loss": 0.6718,
+      "step": 1157
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.3569257258786691,
+      "learning_rate": 0.00018421861884602414,
+      "loss": 0.6714,
+      "step": 1158
+    },
+    {
+      "epoch": 0.20604444444444445,
+      "grad_norm": 0.36420710820280233,
+      "learning_rate": 0.0001841875582341317,
+      "loss": 0.6595,
+      "step": 1159
+    },
+    {
+      "epoch": 0.20622222222222222,
+      "grad_norm": 0.3667777885499808,
+      "learning_rate": 0.0001841564697097272,
+      "loss": 0.6966,
+      "step": 1160
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.3698696460020529,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.6739,
+      "step": 1161
+    },
+    {
+      "epoch": 0.20657777777777778,
+      "grad_norm": 0.3603207099840638,
+      "learning_rate": 0.00018409420896462112,
+      "loss": 0.676,
+      "step": 1162
+    },
+    {
+      "epoch": 0.20675555555555555,
+      "grad_norm": 0.35764408962982924,
+      "learning_rate": 0.00018406303676456217,
+      "loss": 0.6324,
+      "step": 1163
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.3786941692537567,
+      "learning_rate": 0.00018403183669327646,
+      "loss": 0.6818,
+      "step": 1164
+    },
+    {
+      "epoch": 0.2071111111111111,
+      "grad_norm": 0.4034681434660428,
+      "learning_rate": 0.00018400060876110842,
+      "loss": 0.6521,
+      "step": 1165
+    },
+    {
+      "epoch": 0.20728888888888888,
+      "grad_norm": 0.37000246611017584,
+      "learning_rate": 0.00018396935297841166,
+      "loss": 0.6702,
+      "step": 1166
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.3936626024740484,
+      "learning_rate": 0.00018393806935554916,
+      "loss": 0.6588,
+      "step": 1167
+    },
+    {
+      "epoch": 0.20764444444444444,
+      "grad_norm": 0.40252913735608165,
+      "learning_rate": 0.00018390675790289302,
+      "loss": 0.7248,
+      "step": 1168
+    },
+    {
+      "epoch": 0.2078222222222222,
+      "grad_norm": 0.391218634766642,
+      "learning_rate": 0.0001838754186308246,
+      "loss": 0.6775,
+      "step": 1169
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.39203492521533034,
+      "learning_rate": 0.0001838440515497345,
+      "loss": 0.67,
+      "step": 1170
+    },
+    {
+      "epoch": 0.20817777777777777,
+      "grad_norm": 0.3662629325527311,
+      "learning_rate": 0.0001838126566700225,
+      "loss": 0.6925,
+      "step": 1171
+    },
+    {
+      "epoch": 0.20835555555555554,
+      "grad_norm": 0.40363709245565926,
+      "learning_rate": 0.00018378123400209764,
+      "loss": 0.6992,
+      "step": 1172
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.3798693237666605,
+      "learning_rate": 0.00018374978355637813,
+      "loss": 0.7335,
+      "step": 1173
+    },
+    {
+      "epoch": 0.2087111111111111,
+      "grad_norm": 0.37116874568327346,
+      "learning_rate": 0.00018371830534329143,
+      "loss": 0.666,
+      "step": 1174
+    },
+    {
+      "epoch": 0.2088888888888889,
+      "grad_norm": 0.38959958943797834,
+      "learning_rate": 0.0001836867993732742,
+      "loss": 0.692,
+      "step": 1175
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.35332744570073377,
+      "learning_rate": 0.00018365526565677226,
+      "loss": 0.6204,
+      "step": 1176
+    },
+    {
+      "epoch": 0.20924444444444446,
+      "grad_norm": 0.3744133796064289,
+      "learning_rate": 0.00018362370420424068,
+      "loss": 0.6645,
+      "step": 1177
+    },
+    {
+      "epoch": 0.20942222222222223,
+      "grad_norm": 0.3834140013396747,
+      "learning_rate": 0.00018359211502614372,
+      "loss": 0.6731,
+      "step": 1178
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.3888578258105978,
+      "learning_rate": 0.00018356049813295476,
+      "loss": 0.65,
+      "step": 1179
+    },
+    {
+      "epoch": 0.20977777777777779,
+      "grad_norm": 0.38652518918507633,
+      "learning_rate": 0.00018352885353515653,
+      "loss": 0.6445,
+      "step": 1180
+    },
+    {
+      "epoch": 0.20995555555555556,
+      "grad_norm": 0.40431841752340153,
+      "learning_rate": 0.00018349718124324076,
+      "loss": 0.6865,
+      "step": 1181
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.3718903706123179,
+      "learning_rate": 0.00018346548126770847,
+      "loss": 0.6711,
+      "step": 1182
+    },
+    {
+      "epoch": 0.21031111111111112,
+      "grad_norm": 0.3612738256205751,
+      "learning_rate": 0.00018343375361906984,
+      "loss": 0.6569,
+      "step": 1183
+    },
+    {
+      "epoch": 0.2104888888888889,
+      "grad_norm": 0.37243505463992294,
+      "learning_rate": 0.00018340199830784422,
+      "loss": 0.6743,
+      "step": 1184
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.3927398200160076,
+      "learning_rate": 0.00018337021534456014,
+      "loss": 0.6656,
+      "step": 1185
+    },
+    {
+      "epoch": 0.21084444444444445,
+      "grad_norm": 0.36802493665053415,
+      "learning_rate": 0.00018333840473975526,
+      "loss": 0.6839,
+      "step": 1186
+    },
+    {
+      "epoch": 0.21102222222222222,
+      "grad_norm": 0.3834102880243585,
+      "learning_rate": 0.00018330656650397646,
+      "loss": 0.6849,
+      "step": 1187
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.38712689986378573,
+      "learning_rate": 0.00018327470064777974,
+      "loss": 0.6995,
+      "step": 1188
+    },
+    {
+      "epoch": 0.21137777777777778,
+      "grad_norm": 0.3621021624951718,
+      "learning_rate": 0.0001832428071817303,
+      "loss": 0.7123,
+      "step": 1189
+    },
+    {
+      "epoch": 0.21155555555555555,
+      "grad_norm": 0.40037133971448435,
+      "learning_rate": 0.00018321088611640245,
+      "loss": 0.6669,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.3958701078086994,
+      "learning_rate": 0.00018317893746237963,
+      "loss": 0.7173,
+      "step": 1191
+    },
+    {
+      "epoch": 0.2119111111111111,
+      "grad_norm": 0.3851579617459599,
+      "learning_rate": 0.00018314696123025454,
+      "loss": 0.6623,
+      "step": 1192
+    },
+    {
+      "epoch": 0.21208888888888888,
+      "grad_norm": 0.3890405939105116,
+      "learning_rate": 0.00018311495743062887,
+      "loss": 0.6764,
+      "step": 1193
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.37172193882892485,
+      "learning_rate": 0.0001830829260741136,
+      "loss": 0.6954,
+      "step": 1194
+    },
+    {
+      "epoch": 0.21244444444444444,
+      "grad_norm": 0.3491154516506322,
+      "learning_rate": 0.00018305086717132873,
+      "loss": 0.6467,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2126222222222222,
+      "grad_norm": 0.3767031029786711,
+      "learning_rate": 0.00018301878073290345,
+      "loss": 0.7115,
+      "step": 1196
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4938887172264978,
+      "learning_rate": 0.00018298666676947606,
+      "loss": 0.6541,
+      "step": 1197
+    },
+    {
+      "epoch": 0.21297777777777777,
+      "grad_norm": 0.36307327022000824,
+      "learning_rate": 0.000182954525291694,
+      "loss": 0.709,
+      "step": 1198
+    },
+    {
+      "epoch": 0.21315555555555554,
+      "grad_norm": 0.3669493220248955,
+      "learning_rate": 0.0001829223563102138,
+      "loss": 0.6255,
+      "step": 1199
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.38854350315322106,
+      "learning_rate": 0.0001828901598357012,
+      "loss": 0.7025,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21351111111111112,
+      "grad_norm": 0.42280080060878183,
+      "learning_rate": 0.00018285793587883092,
+      "loss": 0.7177,
+      "step": 1201
+    },
+    {
+      "epoch": 0.2136888888888889,
+      "grad_norm": 0.3828164078513971,
+      "learning_rate": 0.0001828256844502869,
+      "loss": 0.707,
+      "step": 1202
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.3730267894679604,
+      "learning_rate": 0.00018279340556076216,
+      "loss": 0.6724,
+      "step": 1203
+    },
+    {
+      "epoch": 0.21404444444444445,
+      "grad_norm": 0.3543769112513274,
+      "learning_rate": 0.00018276109922095877,
+      "loss": 0.6726,
+      "step": 1204
+    },
+    {
+      "epoch": 0.21422222222222223,
+      "grad_norm": 0.35576667055637556,
+      "learning_rate": 0.00018272876544158794,
+      "loss": 0.6317,
+      "step": 1205
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.366934105456551,
+      "learning_rate": 0.00018269640423337007,
+      "loss": 0.6926,
+      "step": 1206
+    },
+    {
+      "epoch": 0.21457777777777778,
+      "grad_norm": 0.3760879476029945,
+      "learning_rate": 0.0001826640156070345,
+      "loss": 0.6379,
+      "step": 1207
+    },
+    {
+      "epoch": 0.21475555555555556,
+      "grad_norm": 0.3519381009965438,
+      "learning_rate": 0.0001826315995733197,
+      "loss": 0.6696,
+      "step": 1208
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.35780396962791067,
+      "learning_rate": 0.0001825991561429733,
+      "loss": 0.6456,
+      "step": 1209
+    },
+    {
+      "epoch": 0.21511111111111111,
+      "grad_norm": 0.4091743342317021,
+      "learning_rate": 0.00018256668532675197,
+      "loss": 0.6812,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2152888888888889,
+      "grad_norm": 0.36528173330022046,
+      "learning_rate": 0.0001825341871354215,
+      "loss": 0.6424,
+      "step": 1211
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.38570555070167545,
+      "learning_rate": 0.00018250166157975661,
+      "loss": 0.7056,
+      "step": 1212
+    },
+    {
+      "epoch": 0.21564444444444444,
+      "grad_norm": 0.38738495633750414,
+      "learning_rate": 0.00018246910867054125,
+      "loss": 0.682,
+      "step": 1213
+    },
+    {
+      "epoch": 0.21582222222222222,
+      "grad_norm": 0.8338716476837322,
+      "learning_rate": 0.0001824365284185684,
+      "loss": 0.6588,
+      "step": 1214
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3464376813860515,
+      "learning_rate": 0.00018240392083464007,
+      "loss": 0.6623,
+      "step": 1215
+    },
+    {
+      "epoch": 0.21617777777777777,
+      "grad_norm": 0.3496356519858567,
+      "learning_rate": 0.00018237128592956737,
+      "loss": 0.6419,
+      "step": 1216
+    },
+    {
+      "epoch": 0.21635555555555555,
+      "grad_norm": 0.37612346679787867,
+      "learning_rate": 0.00018233862371417047,
+      "loss": 0.6733,
+      "step": 1217
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.36049843791702,
+      "learning_rate": 0.00018230593419927852,
+      "loss": 0.6382,
+      "step": 1218
+    },
+    {
+      "epoch": 0.2167111111111111,
+      "grad_norm": 0.387637216036029,
+      "learning_rate": 0.00018227321739572983,
+      "loss": 0.642,
+      "step": 1219
+    },
+    {
+      "epoch": 0.21688888888888888,
+      "grad_norm": 0.3861937665549605,
+      "learning_rate": 0.00018224047331437165,
+      "loss": 0.6769,
+      "step": 1220
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.36731108686428177,
+      "learning_rate": 0.0001822077019660604,
+      "loss": 0.7014,
+      "step": 1221
+    },
+    {
+      "epoch": 0.21724444444444443,
+      "grad_norm": 0.388699854849361,
+      "learning_rate": 0.00018217490336166144,
+      "loss": 0.7072,
+      "step": 1222
+    },
+    {
+      "epoch": 0.2174222222222222,
+      "grad_norm": 0.3819613871631891,
+      "learning_rate": 0.00018214207751204918,
+      "loss": 0.7297,
+      "step": 1223
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3680423732001841,
+      "learning_rate": 0.00018210922442810708,
+      "loss": 0.6336,
+      "step": 1224
+    },
+    {
+      "epoch": 0.21777777777777776,
+      "grad_norm": 0.3376134165731433,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 0.6477,
+      "step": 1225
+    },
+    {
+      "epoch": 0.21795555555555557,
+      "grad_norm": 0.37275274132289626,
+      "learning_rate": 0.0001820434366008124,
+      "loss": 0.6881,
+      "step": 1226
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.38555044919326564,
+      "learning_rate": 0.00018201050187927184,
+      "loss": 0.6754,
+      "step": 1227
+    },
+    {
+      "epoch": 0.21831111111111112,
+      "grad_norm": 0.3594542843830291,
+      "learning_rate": 0.00018197753996702557,
+      "loss": 0.6099,
+      "step": 1228
+    },
+    {
+      "epoch": 0.2184888888888889,
+      "grad_norm": 0.377758974478313,
+      "learning_rate": 0.00018194455087500218,
+      "loss": 0.6943,
+      "step": 1229
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.38270904518351395,
+      "learning_rate": 0.00018191153461413916,
+      "loss": 0.6981,
+      "step": 1230
+    },
+    {
+      "epoch": 0.21884444444444445,
+      "grad_norm": 0.36243414789220363,
+      "learning_rate": 0.00018187849119538318,
+      "loss": 0.6399,
+      "step": 1231
+    },
+    {
+      "epoch": 0.21902222222222223,
+      "grad_norm": 0.3555052323035816,
+      "learning_rate": 0.00018184542062968983,
+      "loss": 0.6332,
+      "step": 1232
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.3499507977114114,
+      "learning_rate": 0.00018181232292802365,
+      "loss": 0.6755,
+      "step": 1233
+    },
+    {
+      "epoch": 0.21937777777777778,
+      "grad_norm": 0.37627367466076955,
+      "learning_rate": 0.0001817791981013583,
+      "loss": 0.6751,
+      "step": 1234
+    },
+    {
+      "epoch": 0.21955555555555556,
+      "grad_norm": 0.3683451874640109,
+      "learning_rate": 0.00018174604616067632,
+      "loss": 0.6456,
+      "step": 1235
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.36823981386516685,
+      "learning_rate": 0.00018171286711696934,
+      "loss": 0.6662,
+      "step": 1236
+    },
+    {
+      "epoch": 0.2199111111111111,
+      "grad_norm": 0.3560313882981796,
+      "learning_rate": 0.00018167966098123786,
+      "loss": 0.6652,
+      "step": 1237
+    },
+    {
+      "epoch": 0.2200888888888889,
+      "grad_norm": 0.37564553359701064,
+      "learning_rate": 0.00018164642776449146,
+      "loss": 0.6878,
+      "step": 1238
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.38845477182844085,
+      "learning_rate": 0.00018161316747774864,
+      "loss": 0.7153,
+      "step": 1239
+    },
+    {
+      "epoch": 0.22044444444444444,
+      "grad_norm": 0.3743997347569959,
+      "learning_rate": 0.00018157988013203693,
+      "loss": 0.6192,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22062222222222222,
+      "grad_norm": 0.3673109626908155,
+      "learning_rate": 0.00018154656573839275,
+      "loss": 0.6409,
+      "step": 1241
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.36651144212340137,
+      "learning_rate": 0.0001815132243078616,
+      "loss": 0.6669,
+      "step": 1242
+    },
+    {
+      "epoch": 0.22097777777777777,
+      "grad_norm": 0.3310886182609289,
+      "learning_rate": 0.00018147985585149784,
+      "loss": 0.6455,
+      "step": 1243
+    },
+    {
+      "epoch": 0.22115555555555555,
+      "grad_norm": 0.37941241054018465,
+      "learning_rate": 0.00018144646038036486,
+      "loss": 0.7017,
+      "step": 1244
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.38345594208676165,
+      "learning_rate": 0.00018141303790553495,
+      "loss": 0.6808,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2215111111111111,
+      "grad_norm": 0.4088640247515805,
+      "learning_rate": 0.00018137958843808936,
+      "loss": 0.6985,
+      "step": 1246
+    },
+    {
+      "epoch": 0.22168888888888888,
+      "grad_norm": 0.36904643335418275,
+      "learning_rate": 0.0001813461119891184,
+      "loss": 0.7165,
+      "step": 1247
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.35382461144969835,
+      "learning_rate": 0.00018131260856972116,
+      "loss": 0.68,
+      "step": 1248
+    },
+    {
+      "epoch": 0.22204444444444443,
+      "grad_norm": 0.3531729011678622,
+      "learning_rate": 0.0001812790781910058,
+      "loss": 0.6658,
+      "step": 1249
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 0.35977107790245616,
+      "learning_rate": 0.0001812455208640893,
+      "loss": 0.6793,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.3518269563523556,
+      "learning_rate": 0.0001812119366000977,
+      "loss": 0.6331,
+      "step": 1251
+    },
+    {
+      "epoch": 0.2225777777777778,
+      "grad_norm": 0.36462610091755804,
+      "learning_rate": 0.00018117832541016587,
+      "loss": 0.7048,
+      "step": 1252
+    },
+    {
+      "epoch": 0.22275555555555557,
+      "grad_norm": 0.35318753293114574,
+      "learning_rate": 0.0001811446873054377,
+      "loss": 0.5774,
+      "step": 1253
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.35081977043426243,
+      "learning_rate": 0.0001811110222970659,
+      "loss": 0.65,
+      "step": 1254
+    },
+    {
+      "epoch": 0.22311111111111112,
+      "grad_norm": 0.39688865382910304,
+      "learning_rate": 0.00018107733039621223,
+      "loss": 0.6775,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2232888888888889,
+      "grad_norm": 0.37686866681610026,
+      "learning_rate": 0.00018104361161404723,
+      "loss": 0.6703,
+      "step": 1256
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.36142850992294384,
+      "learning_rate": 0.00018100986596175046,
+      "loss": 0.6314,
+      "step": 1257
+    },
+    {
+      "epoch": 0.22364444444444445,
+      "grad_norm": 0.3798419234632831,
+      "learning_rate": 0.00018097609345051025,
+      "loss": 0.7103,
+      "step": 1258
+    },
+    {
+      "epoch": 0.22382222222222223,
+      "grad_norm": 0.3712502957307875,
+      "learning_rate": 0.00018094229409152402,
+      "loss": 0.6544,
+      "step": 1259
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.36049302217826423,
+      "learning_rate": 0.00018090846789599798,
+      "loss": 0.7147,
+      "step": 1260
+    },
+    {
+      "epoch": 0.22417777777777778,
+      "grad_norm": 0.3523508248932008,
+      "learning_rate": 0.00018087461487514722,
+      "loss": 0.708,
+      "step": 1261
+    },
+    {
+      "epoch": 0.22435555555555556,
+      "grad_norm": 0.36155258534713897,
+      "learning_rate": 0.0001808407350401958,
+      "loss": 0.6691,
+      "step": 1262
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.3544347270915348,
+      "learning_rate": 0.0001808068284023766,
+      "loss": 0.6486,
+      "step": 1263
+    },
+    {
+      "epoch": 0.2247111111111111,
+      "grad_norm": 0.3555104595715303,
+      "learning_rate": 0.00018077289497293143,
+      "loss": 0.6257,
+      "step": 1264
+    },
+    {
+      "epoch": 0.2248888888888889,
+      "grad_norm": 0.356306142872019,
+      "learning_rate": 0.00018073893476311097,
+      "loss": 0.6621,
+      "step": 1265
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.3504699812872086,
+      "learning_rate": 0.00018070494778417477,
+      "loss": 0.6714,
+      "step": 1266
+    },
+    {
+      "epoch": 0.22524444444444444,
+      "grad_norm": 0.3889531451696137,
+      "learning_rate": 0.0001806709340473913,
+      "loss": 0.6766,
+      "step": 1267
+    },
+    {
+      "epoch": 0.22542222222222222,
+      "grad_norm": 0.36801762972423285,
+      "learning_rate": 0.0001806368935640378,
+      "loss": 0.6408,
+      "step": 1268
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.3521496509119404,
+      "learning_rate": 0.00018060282634540053,
+      "loss": 0.6401,
+      "step": 1269
+    },
+    {
+      "epoch": 0.22577777777777777,
+      "grad_norm": 0.35311540887409437,
+      "learning_rate": 0.00018056873240277445,
+      "loss": 0.6915,
+      "step": 1270
+    },
+    {
+      "epoch": 0.22595555555555555,
+      "grad_norm": 0.3559641166594349,
+      "learning_rate": 0.0001805346117474635,
+      "loss": 0.6021,
+      "step": 1271
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.3591596263716163,
+      "learning_rate": 0.0001805004643907804,
+      "loss": 0.6685,
+      "step": 1272
+    },
+    {
+      "epoch": 0.2263111111111111,
+      "grad_norm": 0.3707216191459186,
+      "learning_rate": 0.0001804662903440468,
+      "loss": 0.6296,
+      "step": 1273
+    },
+    {
+      "epoch": 0.22648888888888888,
+      "grad_norm": 0.3548733735155643,
+      "learning_rate": 0.00018043208961859316,
+      "loss": 0.6536,
+      "step": 1274
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.36397318383873745,
+      "learning_rate": 0.00018039786222575873,
+      "loss": 0.6476,
+      "step": 1275
+    },
+    {
+      "epoch": 0.22684444444444443,
+      "grad_norm": 0.3748274906465537,
+      "learning_rate": 0.0001803636081768917,
+      "loss": 0.6453,
+      "step": 1276
+    },
+    {
+      "epoch": 0.2270222222222222,
+      "grad_norm": 0.3541633584386624,
+      "learning_rate": 0.00018032932748334902,
+      "loss": 0.6636,
+      "step": 1277
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.365831485623866,
+      "learning_rate": 0.00018029502015649647,
+      "loss": 0.6786,
+      "step": 1278
+    },
+    {
+      "epoch": 0.2273777777777778,
+      "grad_norm": 0.3720993619824261,
+      "learning_rate": 0.00018026068620770883,
+      "loss": 0.6317,
+      "step": 1279
+    },
+    {
+      "epoch": 0.22755555555555557,
+      "grad_norm": 0.36783510120315577,
+      "learning_rate": 0.00018022632564836948,
+      "loss": 0.6508,
+      "step": 1280
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.3834184736054038,
+      "learning_rate": 0.0001801919384898707,
+      "loss": 0.6199,
+      "step": 1281
+    },
+    {
+      "epoch": 0.22791111111111112,
+      "grad_norm": 0.341374217529724,
+      "learning_rate": 0.00018015752474361362,
+      "loss": 0.6355,
+      "step": 1282
+    },
+    {
+      "epoch": 0.2280888888888889,
+      "grad_norm": 0.3871502641429102,
+      "learning_rate": 0.00018012308442100824,
+      "loss": 0.6575,
+      "step": 1283
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.35645642383502985,
+      "learning_rate": 0.00018008861753347316,
+      "loss": 0.635,
+      "step": 1284
+    },
+    {
+      "epoch": 0.22844444444444445,
+      "grad_norm": 0.4270501104834336,
+      "learning_rate": 0.00018005412409243606,
+      "loss": 0.6521,
+      "step": 1285
+    },
+    {
+      "epoch": 0.22862222222222223,
+      "grad_norm": 0.39586136842836317,
+      "learning_rate": 0.0001800196041093332,
+      "loss": 0.6907,
+      "step": 1286
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.359522010241049,
+      "learning_rate": 0.0001799850575956098,
+      "loss": 0.6285,
+      "step": 1287
+    },
+    {
+      "epoch": 0.22897777777777778,
+      "grad_norm": 0.3874640161748343,
+      "learning_rate": 0.0001799504845627198,
+      "loss": 0.6892,
+      "step": 1288
+    },
+    {
+      "epoch": 0.22915555555555556,
+      "grad_norm": 0.36882034365720556,
+      "learning_rate": 0.0001799158850221259,
+      "loss": 0.6547,
+      "step": 1289
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.3820864627123511,
+      "learning_rate": 0.00017988125898529966,
+      "loss": 0.679,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2295111111111111,
+      "grad_norm": 0.35281146556952625,
+      "learning_rate": 0.0001798466064637214,
+      "loss": 0.6781,
+      "step": 1291
+    },
+    {
+      "epoch": 0.2296888888888889,
+      "grad_norm": 0.3713894627884917,
+      "learning_rate": 0.00017981192746888017,
+      "loss": 0.6234,
+      "step": 1292
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.3644697301527836,
+      "learning_rate": 0.0001797772220122739,
+      "loss": 0.6649,
+      "step": 1293
+    },
+    {
+      "epoch": 0.23004444444444444,
+      "grad_norm": 0.3692195806384566,
+      "learning_rate": 0.0001797424901054092,
+      "loss": 0.6924,
+      "step": 1294
+    },
+    {
+      "epoch": 0.23022222222222222,
+      "grad_norm": 0.3868252632234773,
+      "learning_rate": 0.0001797077317598015,
+      "loss": 0.6935,
+      "step": 1295
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.39208838602648277,
+      "learning_rate": 0.000179672946986975,
+      "loss": 0.6657,
+      "step": 1296
+    },
+    {
+      "epoch": 0.23057777777777777,
+      "grad_norm": 0.4082130619109606,
+      "learning_rate": 0.0001796381357984626,
+      "loss": 0.7243,
+      "step": 1297
+    },
+    {
+      "epoch": 0.23075555555555555,
+      "grad_norm": 0.3662372728050405,
+      "learning_rate": 0.00017960329820580607,
+      "loss": 0.6405,
+      "step": 1298
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.36352467148714435,
+      "learning_rate": 0.0001795684342205558,
+      "loss": 0.642,
+      "step": 1299
+    },
+    {
+      "epoch": 0.2311111111111111,
+      "grad_norm": 0.40917616866185247,
+      "learning_rate": 0.000179533543854271,
+      "loss": 0.6205,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23128888888888888,
+      "grad_norm": 0.3842333705992674,
+      "learning_rate": 0.00017949862711851965,
+      "loss": 0.627,
+      "step": 1301
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.3955545266735241,
+      "learning_rate": 0.00017946368402487845,
+      "loss": 0.6832,
+      "step": 1302
+    },
+    {
+      "epoch": 0.23164444444444443,
+      "grad_norm": 0.37563124393497516,
+      "learning_rate": 0.00017942871458493284,
+      "loss": 0.6338,
+      "step": 1303
+    },
+    {
+      "epoch": 0.23182222222222224,
+      "grad_norm": 0.3967110033775136,
+      "learning_rate": 0.00017939371881027697,
+      "loss": 0.6939,
+      "step": 1304
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3516919270429312,
+      "learning_rate": 0.00017935869671251378,
+      "loss": 0.6215,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2321777777777778,
+      "grad_norm": 0.37401797171452594,
+      "learning_rate": 0.0001793236483032548,
+      "loss": 0.6512,
+      "step": 1306
+    },
+    {
+      "epoch": 0.23235555555555557,
+      "grad_norm": 0.3867622724788743,
+      "learning_rate": 0.0001792885735941205,
+      "loss": 0.6782,
+      "step": 1307
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.35924062182946515,
+      "learning_rate": 0.0001792534725967399,
+      "loss": 0.6773,
+      "step": 1308
+    },
+    {
+      "epoch": 0.23271111111111112,
+      "grad_norm": 0.3711545394429619,
+      "learning_rate": 0.00017921834532275076,
+      "loss": 0.6326,
+      "step": 1309
+    },
+    {
+      "epoch": 0.2328888888888889,
+      "grad_norm": 0.3562773430235455,
+      "learning_rate": 0.00017918319178379967,
+      "loss": 0.6511,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.36525151006923434,
+      "learning_rate": 0.00017914801199154175,
+      "loss": 0.6568,
+      "step": 1311
+    },
+    {
+      "epoch": 0.23324444444444445,
+      "grad_norm": 0.3701701674212604,
+      "learning_rate": 0.00017911280595764092,
+      "loss": 0.6281,
+      "step": 1312
+    },
+    {
+      "epoch": 0.23342222222222223,
+      "grad_norm": 0.39619610073025263,
+      "learning_rate": 0.00017907757369376985,
+      "loss": 0.7054,
+      "step": 1313
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3634351419349522,
+      "learning_rate": 0.00017904231521160982,
+      "loss": 0.6867,
+      "step": 1314
+    },
+    {
+      "epoch": 0.23377777777777778,
+      "grad_norm": 0.38291421783768703,
+      "learning_rate": 0.00017900703052285084,
+      "loss": 0.6927,
+      "step": 1315
+    },
+    {
+      "epoch": 0.23395555555555556,
+      "grad_norm": 0.37420360371681227,
+      "learning_rate": 0.0001789717196391916,
+      "loss": 0.6726,
+      "step": 1316
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.3485924585311357,
+      "learning_rate": 0.00017893638257233943,
+      "loss": 0.605,
+      "step": 1317
+    },
+    {
+      "epoch": 0.2343111111111111,
+      "grad_norm": 0.3890514924650713,
+      "learning_rate": 0.00017890101933401047,
+      "loss": 0.684,
+      "step": 1318
+    },
+    {
+      "epoch": 0.23448888888888889,
+      "grad_norm": 0.37679602376150895,
+      "learning_rate": 0.0001788656299359294,
+      "loss": 0.6403,
+      "step": 1319
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.36588941825170834,
+      "learning_rate": 0.00017883021438982964,
+      "loss": 0.654,
+      "step": 1320
+    },
+    {
+      "epoch": 0.23484444444444444,
+      "grad_norm": 0.3762464975943449,
+      "learning_rate": 0.00017879477270745328,
+      "loss": 0.7041,
+      "step": 1321
+    },
+    {
+      "epoch": 0.23502222222222222,
+      "grad_norm": 0.36130975481183664,
+      "learning_rate": 0.00017875930490055106,
+      "loss": 0.6549,
+      "step": 1322
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.37374764256953785,
+      "learning_rate": 0.00017872381098088237,
+      "loss": 0.7119,
+      "step": 1323
+    },
+    {
+      "epoch": 0.23537777777777777,
+      "grad_norm": 0.41445183140938907,
+      "learning_rate": 0.00017868829096021527,
+      "loss": 0.7511,
+      "step": 1324
+    },
+    {
+      "epoch": 0.23555555555555555,
+      "grad_norm": 0.3426309998598964,
+      "learning_rate": 0.0001786527448503265,
+      "loss": 0.6733,
+      "step": 1325
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.3745962267587148,
+      "learning_rate": 0.0001786171726630014,
+      "loss": 0.6558,
+      "step": 1326
+    },
+    {
+      "epoch": 0.2359111111111111,
+      "grad_norm": 0.37760340600558523,
+      "learning_rate": 0.000178581574410034,
+      "loss": 0.7098,
+      "step": 1327
+    },
+    {
+      "epoch": 0.23608888888888888,
+      "grad_norm": 0.3631130973598564,
+      "learning_rate": 0.000178545950103227,
+      "loss": 0.687,
+      "step": 1328
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.37442239653839543,
+      "learning_rate": 0.00017851029975439158,
+      "loss": 0.6873,
+      "step": 1329
+    },
+    {
+      "epoch": 0.23644444444444446,
+      "grad_norm": 0.3544908461425953,
+      "learning_rate": 0.00017847462337534776,
+      "loss": 0.6434,
+      "step": 1330
+    },
+    {
+      "epoch": 0.23662222222222223,
+      "grad_norm": 0.37019313698788225,
+      "learning_rate": 0.00017843892097792408,
+      "loss": 0.6672,
+      "step": 1331
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3760298925859323,
+      "learning_rate": 0.00017840319257395767,
+      "loss": 0.6366,
+      "step": 1332
+    },
+    {
+      "epoch": 0.2369777777777778,
+      "grad_norm": 0.40976279691863876,
+      "learning_rate": 0.0001783674381752944,
+      "loss": 0.6889,
+      "step": 1333
+    },
+    {
+      "epoch": 0.23715555555555556,
+      "grad_norm": 0.3582943729244857,
+      "learning_rate": 0.00017833165779378867,
+      "loss": 0.6395,
+      "step": 1334
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.36225126761600807,
+      "learning_rate": 0.00017829585144130356,
+      "loss": 0.6043,
+      "step": 1335
+    },
+    {
+      "epoch": 0.23751111111111112,
+      "grad_norm": 0.3537156017879337,
+      "learning_rate": 0.00017826001912971066,
+      "loss": 0.6637,
+      "step": 1336
+    },
+    {
+      "epoch": 0.2376888888888889,
+      "grad_norm": 0.36183506689018097,
+      "learning_rate": 0.00017822416087089025,
+      "loss": 0.6818,
+      "step": 1337
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.35941325243183436,
+      "learning_rate": 0.00017818827667673116,
+      "loss": 0.6767,
+      "step": 1338
+    },
+    {
+      "epoch": 0.23804444444444445,
+      "grad_norm": 0.36445531796959796,
+      "learning_rate": 0.00017815236655913092,
+      "loss": 0.6328,
+      "step": 1339
+    },
+    {
+      "epoch": 0.23822222222222222,
+      "grad_norm": 0.3687817642032925,
+      "learning_rate": 0.00017811643052999552,
+      "loss": 0.6465,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.3741595498147515,
+      "learning_rate": 0.0001780804686012396,
+      "loss": 0.6868,
+      "step": 1341
+    },
+    {
+      "epoch": 0.23857777777777778,
+      "grad_norm": 0.3891494595627898,
+      "learning_rate": 0.00017804448078478647,
+      "loss": 0.7196,
+      "step": 1342
+    },
+    {
+      "epoch": 0.23875555555555555,
+      "grad_norm": 0.36015019915748786,
+      "learning_rate": 0.0001780084670925679,
+      "loss": 0.603,
+      "step": 1343
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.3790785524684745,
+      "learning_rate": 0.00017797242753652423,
+      "loss": 0.6807,
+      "step": 1344
+    },
+    {
+      "epoch": 0.2391111111111111,
+      "grad_norm": 0.3609255760488036,
+      "learning_rate": 0.00017793636212860449,
+      "loss": 0.6612,
+      "step": 1345
+    },
+    {
+      "epoch": 0.23928888888888888,
+      "grad_norm": 0.3460993330945266,
+      "learning_rate": 0.0001779002708807662,
+      "loss": 0.6644,
+      "step": 1346
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.39822585606614946,
+      "learning_rate": 0.00017786415380497553,
+      "loss": 0.6763,
+      "step": 1347
+    },
+    {
+      "epoch": 0.23964444444444444,
+      "grad_norm": 0.37981576095414493,
+      "learning_rate": 0.00017782801091320707,
+      "loss": 0.6523,
+      "step": 1348
+    },
+    {
+      "epoch": 0.23982222222222221,
+      "grad_norm": 0.37352954163328894,
+      "learning_rate": 0.00017779184221744404,
+      "loss": 0.6211,
+      "step": 1349
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3817468997975673,
+      "learning_rate": 0.0001777556477296783,
+      "loss": 0.6404,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24017777777777777,
+      "grad_norm": 0.39381365196624535,
+      "learning_rate": 0.00017771942746191014,
+      "loss": 0.6974,
+      "step": 1351
+    },
+    {
+      "epoch": 0.24035555555555554,
+      "grad_norm": 0.4230517518256083,
+      "learning_rate": 0.00017768318142614845,
+      "loss": 0.714,
+      "step": 1352
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.390896453245426,
+      "learning_rate": 0.00017764690963441066,
+      "loss": 0.6898,
+      "step": 1353
+    },
+    {
+      "epoch": 0.2407111111111111,
+      "grad_norm": 0.3942472823419236,
+      "learning_rate": 0.00017761061209872273,
+      "loss": 0.7128,
+      "step": 1354
+    },
+    {
+      "epoch": 0.2408888888888889,
+      "grad_norm": 0.3706793167830517,
+      "learning_rate": 0.00017757428883111918,
+      "loss": 0.6757,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.35878355668735806,
+      "learning_rate": 0.00017753793984364306,
+      "loss": 0.652,
+      "step": 1356
+    },
+    {
+      "epoch": 0.24124444444444446,
+      "grad_norm": 0.5370187234823591,
+      "learning_rate": 0.0001775015651483459,
+      "loss": 0.6293,
+      "step": 1357
+    },
+    {
+      "epoch": 0.24142222222222223,
+      "grad_norm": 0.35793651468723764,
+      "learning_rate": 0.00017746516475728775,
+      "loss": 0.6644,
+      "step": 1358
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.3548400450923138,
+      "learning_rate": 0.0001774287386825373,
+      "loss": 0.6407,
+      "step": 1359
+    },
+    {
+      "epoch": 0.24177777777777779,
+      "grad_norm": 0.40336711959002175,
+      "learning_rate": 0.0001773922869361716,
+      "loss": 0.7308,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24195555555555556,
+      "grad_norm": 0.34835124751478924,
+      "learning_rate": 0.00017735580953027636,
+      "loss": 0.6656,
+      "step": 1361
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.3603559317123436,
+      "learning_rate": 0.0001773193064769456,
+      "loss": 0.6582,
+      "step": 1362
+    },
+    {
+      "epoch": 0.24231111111111112,
+      "grad_norm": 0.349635256140617,
+      "learning_rate": 0.0001772827777882821,
+      "loss": 0.619,
+      "step": 1363
+    },
+    {
+      "epoch": 0.2424888888888889,
+      "grad_norm": 0.3533609881973546,
+      "learning_rate": 0.00017724622347639688,
+      "loss": 0.6738,
+      "step": 1364
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.35940222120923704,
+      "learning_rate": 0.00017720964355340962,
+      "loss": 0.6313,
+      "step": 1365
+    },
+    {
+      "epoch": 0.24284444444444445,
+      "grad_norm": 0.35559852737944414,
+      "learning_rate": 0.00017717303803144852,
+      "loss": 0.641,
+      "step": 1366
+    },
+    {
+      "epoch": 0.24302222222222222,
+      "grad_norm": 0.35218243384859543,
+      "learning_rate": 0.00017713640692265008,
+      "loss": 0.6587,
+      "step": 1367
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3639913447558467,
+      "learning_rate": 0.00017709975023915949,
+      "loss": 0.6391,
+      "step": 1368
+    },
+    {
+      "epoch": 0.24337777777777778,
+      "grad_norm": 0.382021541566799,
+      "learning_rate": 0.00017706306799313026,
+      "loss": 0.6723,
+      "step": 1369
+    },
+    {
+      "epoch": 0.24355555555555555,
+      "grad_norm": 0.3728795948978041,
+      "learning_rate": 0.0001770263601967245,
+      "loss": 0.6647,
+      "step": 1370
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.37386094141962695,
+      "learning_rate": 0.00017698962686211268,
+      "loss": 0.6659,
+      "step": 1371
+    },
+    {
+      "epoch": 0.2439111111111111,
+      "grad_norm": 0.5491656199225733,
+      "learning_rate": 0.0001769528680014739,
+      "loss": 0.6503,
+      "step": 1372
+    },
+    {
+      "epoch": 0.24408888888888888,
+      "grad_norm": 0.3747091180279064,
+      "learning_rate": 0.00017691608362699546,
+      "loss": 0.6691,
+      "step": 1373
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.36053556512307666,
+      "learning_rate": 0.00017687927375087338,
+      "loss": 0.6693,
+      "step": 1374
+    },
+    {
+      "epoch": 0.24444444444444444,
+      "grad_norm": 0.35392200268040286,
+      "learning_rate": 0.000176842438385312,
+      "loss": 0.669,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2446222222222222,
+      "grad_norm": 0.3515524141662454,
+      "learning_rate": 0.00017680557754252418,
+      "loss": 0.6504,
+      "step": 1376
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3622308577856946,
+      "learning_rate": 0.00017676869123473113,
+      "loss": 0.64,
+      "step": 1377
+    },
+    {
+      "epoch": 0.24497777777777777,
+      "grad_norm": 0.3752560157308086,
+      "learning_rate": 0.00017673177947416258,
+      "loss": 0.6691,
+      "step": 1378
+    },
+    {
+      "epoch": 0.24515555555555554,
+      "grad_norm": 0.5522802115221203,
+      "learning_rate": 0.0001766948422730567,
+      "loss": 0.71,
+      "step": 1379
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.3590664599368048,
+      "learning_rate": 0.00017665787964366006,
+      "loss": 0.6683,
+      "step": 1380
+    },
+    {
+      "epoch": 0.24551111111111112,
+      "grad_norm": 0.3619576043511919,
+      "learning_rate": 0.00017662089159822765,
+      "loss": 0.6669,
+      "step": 1381
+    },
+    {
+      "epoch": 0.2456888888888889,
+      "grad_norm": 0.3410550269326608,
+      "learning_rate": 0.00017658387814902294,
+      "loss": 0.6366,
+      "step": 1382
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.3817131022287267,
+      "learning_rate": 0.00017654683930831783,
+      "loss": 0.6605,
+      "step": 1383
+    },
+    {
+      "epoch": 0.24604444444444445,
+      "grad_norm": 0.38586700251304207,
+      "learning_rate": 0.00017650977508839254,
+      "loss": 0.642,
+      "step": 1384
+    },
+    {
+      "epoch": 0.24622222222222223,
+      "grad_norm": 0.36425469664368254,
+      "learning_rate": 0.00017647268550153583,
+      "loss": 0.6774,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.38394910279537725,
+      "learning_rate": 0.00017643557056004473,
+      "loss": 0.7108,
+      "step": 1386
+    },
+    {
+      "epoch": 0.24657777777777778,
+      "grad_norm": 0.3777166576941319,
+      "learning_rate": 0.0001763984302762248,
+      "loss": 0.7045,
+      "step": 1387
+    },
+    {
+      "epoch": 0.24675555555555556,
+      "grad_norm": 0.3521410536631835,
+      "learning_rate": 0.00017636126466238995,
+      "loss": 0.6311,
+      "step": 1388
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.36195372691412486,
+      "learning_rate": 0.00017632407373086256,
+      "loss": 0.6433,
+      "step": 1389
+    },
+    {
+      "epoch": 0.24711111111111111,
+      "grad_norm": 0.3781227718722322,
+      "learning_rate": 0.0001762868574939732,
+      "loss": 0.683,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2472888888888889,
+      "grad_norm": 0.3657836825496804,
+      "learning_rate": 0.0001762496159640611,
+      "loss": 0.6403,
+      "step": 1391
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.3980086528061227,
+      "learning_rate": 0.00017621234915347368,
+      "loss": 0.7235,
+      "step": 1392
+    },
+    {
+      "epoch": 0.24764444444444444,
+      "grad_norm": 0.39661278406014594,
+      "learning_rate": 0.00017617505707456682,
+      "loss": 0.6645,
+      "step": 1393
+    },
+    {
+      "epoch": 0.24782222222222222,
+      "grad_norm": 0.4017342434831154,
+      "learning_rate": 0.00017613773973970478,
+      "loss": 0.6488,
+      "step": 1394
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3844575421862563,
+      "learning_rate": 0.00017610039716126018,
+      "loss": 0.6454,
+      "step": 1395
+    },
+    {
+      "epoch": 0.24817777777777777,
+      "grad_norm": 0.36205821497124974,
+      "learning_rate": 0.00017606302935161395,
+      "loss": 0.6677,
+      "step": 1396
+    },
+    {
+      "epoch": 0.24835555555555555,
+      "grad_norm": 0.3778362662591267,
+      "learning_rate": 0.00017602563632315553,
+      "loss": 0.656,
+      "step": 1397
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.3503277617173709,
+      "learning_rate": 0.0001759882180882826,
+      "loss": 0.6334,
+      "step": 1398
+    },
+    {
+      "epoch": 0.2487111111111111,
+      "grad_norm": 0.38157367612851106,
+      "learning_rate": 0.00017595077465940118,
+      "loss": 0.6781,
+      "step": 1399
+    },
+    {
+      "epoch": 0.24888888888888888,
+      "grad_norm": 0.37426016440352167,
+      "learning_rate": 0.00017591330604892574,
+      "loss": 0.6494,
+      "step": 1400
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.3907342559241473,
+      "learning_rate": 0.0001758758122692791,
+      "loss": 0.6914,
+      "step": 1401
+    },
+    {
+      "epoch": 0.24924444444444444,
+      "grad_norm": 0.38123067361092744,
+      "learning_rate": 0.0001758382933328923,
+      "loss": 0.6141,
+      "step": 1402
+    },
+    {
+      "epoch": 0.2494222222222222,
+      "grad_norm": 0.4038856882325022,
+      "learning_rate": 0.00017580074925220487,
+      "loss": 0.7019,
+      "step": 1403
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.36669749830297804,
+      "learning_rate": 0.00017576318003966455,
+      "loss": 0.6717,
+      "step": 1404
+    },
+    {
+      "epoch": 0.24977777777777777,
+      "grad_norm": 0.39651044883944064,
+      "learning_rate": 0.0001757255857077275,
+      "loss": 0.6905,
+      "step": 1405
+    },
+    {
+      "epoch": 0.24995555555555554,
+      "grad_norm": 0.3686611708426602,
+      "learning_rate": 0.00017568796626885814,
+      "loss": 0.6423,
+      "step": 1406
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.37802649141783823,
+      "learning_rate": 0.0001756503217355293,
+      "loss": 0.6451,
+      "step": 1407
+    },
+    {
+      "epoch": 0.2503111111111111,
+      "grad_norm": 0.3887471225498165,
+      "learning_rate": 0.00017561265212022206,
+      "loss": 0.6882,
+      "step": 1408
+    },
+    {
+      "epoch": 0.25048888888888887,
+      "grad_norm": 0.3909425460101287,
+      "learning_rate": 0.00017557495743542585,
+      "loss": 0.6854,
+      "step": 1409
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.36337171937273804,
+      "learning_rate": 0.00017553723769363837,
+      "loss": 0.6894,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2508444444444444,
+      "grad_norm": 0.3650226302164386,
+      "learning_rate": 0.00017549949290736566,
+      "loss": 0.6324,
+      "step": 1411
+    },
+    {
+      "epoch": 0.2510222222222222,
+      "grad_norm": 0.3758796481662887,
+      "learning_rate": 0.00017546172308912213,
+      "loss": 0.6883,
+      "step": 1412
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3729612994454608,
+      "learning_rate": 0.00017542392825143033,
+      "loss": 0.6977,
+      "step": 1413
+    },
+    {
+      "epoch": 0.25137777777777776,
+      "grad_norm": 0.3581757788334183,
+      "learning_rate": 0.00017538610840682126,
+      "loss": 0.6138,
+      "step": 1414
+    },
+    {
+      "epoch": 0.25155555555555553,
+      "grad_norm": 0.37614567145559047,
+      "learning_rate": 0.0001753482635678341,
+      "loss": 0.6497,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.39467932742866707,
+      "learning_rate": 0.00017531039374701636,
+      "loss": 0.699,
+      "step": 1416
+    },
+    {
+      "epoch": 0.2519111111111111,
+      "grad_norm": 0.38233021829393277,
+      "learning_rate": 0.0001752724989569239,
+      "loss": 0.673,
+      "step": 1417
+    },
+    {
+      "epoch": 0.25208888888888886,
+      "grad_norm": 0.36017435952321125,
+      "learning_rate": 0.00017523457921012075,
+      "loss": 0.6234,
+      "step": 1418
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.3694854023500707,
+      "learning_rate": 0.00017519663451917925,
+      "loss": 0.6197,
+      "step": 1419
+    },
+    {
+      "epoch": 0.25244444444444447,
+      "grad_norm": 0.3778772339124559,
+      "learning_rate": 0.00017515866489668005,
+      "loss": 0.6179,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25262222222222225,
+      "grad_norm": 0.36663263378993366,
+      "learning_rate": 0.000175120670355212,
+      "loss": 0.6412,
+      "step": 1421
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4006414162543807,
+      "learning_rate": 0.00017508265090737226,
+      "loss": 0.6744,
+      "step": 1422
+    },
+    {
+      "epoch": 0.2529777777777778,
+      "grad_norm": 0.3807019198494924,
+      "learning_rate": 0.00017504460656576627,
+      "loss": 0.6377,
+      "step": 1423
+    },
+    {
+      "epoch": 0.2531555555555556,
+      "grad_norm": 0.38091276177551353,
+      "learning_rate": 0.00017500653734300764,
+      "loss": 0.6234,
+      "step": 1424
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.35036734350459603,
+      "learning_rate": 0.00017496844325171827,
+      "loss": 0.6522,
+      "step": 1425
+    },
+    {
+      "epoch": 0.25351111111111113,
+      "grad_norm": 0.3573699751241585,
+      "learning_rate": 0.00017493032430452842,
+      "loss": 0.6446,
+      "step": 1426
+    },
+    {
+      "epoch": 0.2536888888888889,
+      "grad_norm": 0.4047691076617589,
+      "learning_rate": 0.00017489218051407638,
+      "loss": 0.636,
+      "step": 1427
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.3844335227443174,
+      "learning_rate": 0.00017485401189300877,
+      "loss": 0.6827,
+      "step": 1428
+    },
+    {
+      "epoch": 0.25404444444444446,
+      "grad_norm": 0.37740262108710376,
+      "learning_rate": 0.0001748158184539805,
+      "loss": 0.6599,
+      "step": 1429
+    },
+    {
+      "epoch": 0.25422222222222224,
+      "grad_norm": 0.3784014533076855,
+      "learning_rate": 0.00017477760020965467,
+      "loss": 0.6817,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3830865036609814,
+      "learning_rate": 0.00017473935717270258,
+      "loss": 0.6998,
+      "step": 1431
+    },
+    {
+      "epoch": 0.2545777777777778,
+      "grad_norm": 0.4155168127586694,
+      "learning_rate": 0.00017470108935580377,
+      "loss": 0.6993,
+      "step": 1432
+    },
+    {
+      "epoch": 0.25475555555555557,
+      "grad_norm": 0.37379696886003255,
+      "learning_rate": 0.000174662796771646,
+      "loss": 0.6519,
+      "step": 1433
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.3631844001484389,
+      "learning_rate": 0.0001746244794329252,
+      "loss": 0.6597,
+      "step": 1434
+    },
+    {
+      "epoch": 0.2551111111111111,
+      "grad_norm": 0.3649259411940409,
+      "learning_rate": 0.0001745861373523456,
+      "loss": 0.6249,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2552888888888889,
+      "grad_norm": 0.36520081445665964,
+      "learning_rate": 0.0001745477705426195,
+      "loss": 0.6369,
+      "step": 1436
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.3768838955651367,
+      "learning_rate": 0.00017450937901646754,
+      "loss": 0.6751,
+      "step": 1437
+    },
+    {
+      "epoch": 0.25564444444444445,
+      "grad_norm": 0.3715771614697907,
+      "learning_rate": 0.00017447096278661844,
+      "loss": 0.6509,
+      "step": 1438
+    },
+    {
+      "epoch": 0.25582222222222223,
+      "grad_norm": 0.3693045630748999,
+      "learning_rate": 0.00017443252186580922,
+      "loss": 0.6625,
+      "step": 1439
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.35314560598286976,
+      "learning_rate": 0.00017439405626678496,
+      "loss": 0.6471,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2561777777777778,
+      "grad_norm": 0.3615879788144135,
+      "learning_rate": 0.00017435556600229902,
+      "loss": 0.6513,
+      "step": 1441
+    },
+    {
+      "epoch": 0.25635555555555556,
+      "grad_norm": 0.3635929590305757,
+      "learning_rate": 0.0001743170510851129,
+      "loss": 0.6301,
+      "step": 1442
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.37253753032719467,
+      "learning_rate": 0.00017427851152799627,
+      "loss": 0.6642,
+      "step": 1443
+    },
+    {
+      "epoch": 0.2567111111111111,
+      "grad_norm": 0.3637263808569345,
+      "learning_rate": 0.000174239947343727,
+      "loss": 0.656,
+      "step": 1444
+    },
+    {
+      "epoch": 0.2568888888888889,
+      "grad_norm": 0.3472046979233085,
+      "learning_rate": 0.0001742013585450911,
+      "loss": 0.6484,
+      "step": 1445
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.3866906466435054,
+      "learning_rate": 0.0001741627451448827,
+      "loss": 0.6714,
+      "step": 1446
+    },
+    {
+      "epoch": 0.25724444444444444,
+      "grad_norm": 0.35997096209171037,
+      "learning_rate": 0.0001741241071559042,
+      "loss": 0.6864,
+      "step": 1447
+    },
+    {
+      "epoch": 0.2574222222222222,
+      "grad_norm": 0.3817306868428725,
+      "learning_rate": 0.00017408544459096605,
+      "loss": 0.6751,
+      "step": 1448
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.3606513883553394,
+      "learning_rate": 0.00017404675746288687,
+      "loss": 0.6356,
+      "step": 1449
+    },
+    {
+      "epoch": 0.2577777777777778,
+      "grad_norm": 0.36828163234806877,
+      "learning_rate": 0.00017400804578449343,
+      "loss": 0.6605,
+      "step": 1450
+    },
+    {
+      "epoch": 0.25795555555555555,
+      "grad_norm": 0.36476502065513966,
+      "learning_rate": 0.00017396930956862068,
+      "loss": 0.6872,
+      "step": 1451
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.38090128506740734,
+      "learning_rate": 0.00017393054882811168,
+      "loss": 0.724,
+      "step": 1452
+    },
+    {
+      "epoch": 0.2583111111111111,
+      "grad_norm": 0.3684218693982843,
+      "learning_rate": 0.00017389176357581753,
+      "loss": 0.6441,
+      "step": 1453
+    },
+    {
+      "epoch": 0.2584888888888889,
+      "grad_norm": 0.373349333221224,
+      "learning_rate": 0.00017385295382459765,
+      "loss": 0.6777,
+      "step": 1454
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.3972388204495291,
+      "learning_rate": 0.0001738141195873194,
+      "loss": 0.7323,
+      "step": 1455
+    },
+    {
+      "epoch": 0.25884444444444443,
+      "grad_norm": 0.37455826557407396,
+      "learning_rate": 0.00017377526087685832,
+      "loss": 0.6939,
+      "step": 1456
+    },
+    {
+      "epoch": 0.2590222222222222,
+      "grad_norm": 0.36104000718727974,
+      "learning_rate": 0.0001737363777060981,
+      "loss": 0.6532,
+      "step": 1457
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3782882968739907,
+      "learning_rate": 0.00017369747008793055,
+      "loss": 0.6211,
+      "step": 1458
+    },
+    {
+      "epoch": 0.25937777777777776,
+      "grad_norm": 0.3692613502820997,
+      "learning_rate": 0.00017365853803525552,
+      "loss": 0.7224,
+      "step": 1459
+    },
+    {
+      "epoch": 0.25955555555555554,
+      "grad_norm": 0.3765006906050503,
+      "learning_rate": 0.00017361958156098095,
+      "loss": 0.7001,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.37263894045485085,
+      "learning_rate": 0.00017358060067802295,
+      "loss": 0.6557,
+      "step": 1461
+    },
+    {
+      "epoch": 0.2599111111111111,
+      "grad_norm": 0.36340261864875034,
+      "learning_rate": 0.00017354159539930572,
+      "loss": 0.6379,
+      "step": 1462
+    },
+    {
+      "epoch": 0.26008888888888887,
+      "grad_norm": 0.36738390711537466,
+      "learning_rate": 0.00017350256573776148,
+      "loss": 0.6296,
+      "step": 1463
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.37171224467902014,
+      "learning_rate": 0.0001734635117063306,
+      "loss": 0.6472,
+      "step": 1464
+    },
+    {
+      "epoch": 0.2604444444444444,
+      "grad_norm": 0.3523571609245952,
+      "learning_rate": 0.00017342443331796147,
+      "loss": 0.6,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2606222222222222,
+      "grad_norm": 0.3503674568803646,
+      "learning_rate": 0.0001733853305856106,
+      "loss": 0.6437,
+      "step": 1466
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4598068073300456,
+      "learning_rate": 0.0001733462035222426,
+      "loss": 0.6657,
+      "step": 1467
+    },
+    {
+      "epoch": 0.26097777777777775,
+      "grad_norm": 0.3944663606343608,
+      "learning_rate": 0.00017330705214083005,
+      "loss": 0.7078,
+      "step": 1468
+    },
+    {
+      "epoch": 0.26115555555555553,
+      "grad_norm": 0.38551066923302413,
+      "learning_rate": 0.0001732678764543537,
+      "loss": 0.7162,
+      "step": 1469
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.416820964935935,
+      "learning_rate": 0.00017322867647580226,
+      "loss": 0.6674,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26151111111111114,
+      "grad_norm": 0.3558581693437286,
+      "learning_rate": 0.00017318945221817255,
+      "loss": 0.6441,
+      "step": 1471
+    },
+    {
+      "epoch": 0.2616888888888889,
+      "grad_norm": 0.39317157498451755,
+      "learning_rate": 0.00017315020369446945,
+      "loss": 0.712,
+      "step": 1472
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.3591427705192805,
+      "learning_rate": 0.00017311093091770588,
+      "loss": 0.6934,
+      "step": 1473
+    },
+    {
+      "epoch": 0.26204444444444447,
+      "grad_norm": 0.36503262368115585,
+      "learning_rate": 0.00017307163390090278,
+      "loss": 0.6618,
+      "step": 1474
+    },
+    {
+      "epoch": 0.26222222222222225,
+      "grad_norm": 0.37082483529626886,
+      "learning_rate": 0.0001730323126570891,
+      "loss": 0.6392,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.41203787591515023,
+      "learning_rate": 0.0001729929671993019,
+      "loss": 0.6305,
+      "step": 1476
+    },
+    {
+      "epoch": 0.2625777777777778,
+      "grad_norm": 0.38123838480997874,
+      "learning_rate": 0.0001729535975405862,
+      "loss": 0.6706,
+      "step": 1477
+    },
+    {
+      "epoch": 0.2627555555555556,
+      "grad_norm": 0.36102807309951895,
+      "learning_rate": 0.0001729142036939951,
+      "loss": 0.6064,
+      "step": 1478
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.4107985758210903,
+      "learning_rate": 0.00017287478567258965,
+      "loss": 0.6272,
+      "step": 1479
+    },
+    {
+      "epoch": 0.26311111111111113,
+      "grad_norm": 0.39509203339984084,
+      "learning_rate": 0.00017283534348943897,
+      "loss": 0.6929,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2632888888888889,
+      "grad_norm": 0.37924882565345963,
+      "learning_rate": 0.00017279587715762022,
+      "loss": 0.6803,
+      "step": 1481
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.34672286380281186,
+      "learning_rate": 0.00017275638669021846,
+      "loss": 0.6011,
+      "step": 1482
+    },
+    {
+      "epoch": 0.26364444444444446,
+      "grad_norm": 0.3497174725947891,
+      "learning_rate": 0.0001727168721003268,
+      "loss": 0.6181,
+      "step": 1483
+    },
+    {
+      "epoch": 0.26382222222222224,
+      "grad_norm": 0.3959601690508927,
+      "learning_rate": 0.00017267733340104645,
+      "loss": 0.6913,
+      "step": 1484
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3621608504877059,
+      "learning_rate": 0.00017263777060548644,
+      "loss": 0.6481,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2641777777777778,
+      "grad_norm": 0.3838358113543457,
+      "learning_rate": 0.00017259818372676394,
+      "loss": 0.6555,
+      "step": 1486
+    },
+    {
+      "epoch": 0.26435555555555557,
+      "grad_norm": 0.36512524928546364,
+      "learning_rate": 0.00017255857277800396,
+      "loss": 0.6271,
+      "step": 1487
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.4078323042994446,
+      "learning_rate": 0.00017251893777233966,
+      "loss": 0.7055,
+      "step": 1488
+    },
+    {
+      "epoch": 0.2647111111111111,
+      "grad_norm": 0.37516990586718085,
+      "learning_rate": 0.000172479278722912,
+      "loss": 0.6598,
+      "step": 1489
+    },
+    {
+      "epoch": 0.2648888888888889,
+      "grad_norm": 0.40777636726708033,
+      "learning_rate": 0.00017243959564287008,
+      "loss": 0.6943,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.5733994481955946,
+      "learning_rate": 0.00017239988854537083,
+      "loss": 0.6834,
+      "step": 1491
+    },
+    {
+      "epoch": 0.26524444444444445,
+      "grad_norm": 0.38818957263294324,
+      "learning_rate": 0.00017236015744357918,
+      "loss": 0.6485,
+      "step": 1492
+    },
+    {
+      "epoch": 0.2654222222222222,
+      "grad_norm": 0.3658027457383031,
+      "learning_rate": 0.0001723204023506681,
+      "loss": 0.642,
+      "step": 1493
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.38183893139070835,
+      "learning_rate": 0.00017228062327981846,
+      "loss": 0.6471,
+      "step": 1494
+    },
+    {
+      "epoch": 0.2657777777777778,
+      "grad_norm": 0.37868478985993326,
+      "learning_rate": 0.000172240820244219,
+      "loss": 0.6574,
+      "step": 1495
+    },
+    {
+      "epoch": 0.26595555555555556,
+      "grad_norm": 0.35590937191998534,
+      "learning_rate": 0.0001722009932570665,
+      "loss": 0.6243,
+      "step": 1496
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.35817749953144884,
+      "learning_rate": 0.00017216114233156566,
+      "loss": 0.6694,
+      "step": 1497
+    },
+    {
+      "epoch": 0.2663111111111111,
+      "grad_norm": 0.3682189663181524,
+      "learning_rate": 0.00017212126748092916,
+      "loss": 0.6788,
+      "step": 1498
+    },
+    {
+      "epoch": 0.2664888888888889,
+      "grad_norm": 0.3702648109099891,
+      "learning_rate": 0.0001720813687183775,
+      "loss": 0.6544,
+      "step": 1499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.3516858030245012,
+      "learning_rate": 0.0001720414460571392,
+      "loss": 0.6514,
+      "step": 1500
+    },
+    {
+      "epoch": 0.26684444444444444,
+      "grad_norm": 0.3970600285967603,
+      "learning_rate": 0.00017200149951045068,
+      "loss": 0.6893,
+      "step": 1501
+    },
+    {
+      "epoch": 0.2670222222222222,
+      "grad_norm": 0.36459457092143216,
+      "learning_rate": 0.00017196152909155628,
+      "loss": 0.6925,
+      "step": 1502
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.375708229588632,
+      "learning_rate": 0.0001719215348137083,
+      "loss": 0.6605,
+      "step": 1503
+    },
+    {
+      "epoch": 0.26737777777777777,
+      "grad_norm": 0.35337184864383386,
+      "learning_rate": 0.00017188151669016678,
+      "loss": 0.6554,
+      "step": 1504
+    },
+    {
+      "epoch": 0.26755555555555555,
+      "grad_norm": 0.3666502990516845,
+      "learning_rate": 0.00017184147473419992,
+      "loss": 0.6228,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.3684981108765006,
+      "learning_rate": 0.00017180140895908363,
+      "loss": 0.6186,
+      "step": 1506
+    },
+    {
+      "epoch": 0.2679111111111111,
+      "grad_norm": 0.36503252371701667,
+      "learning_rate": 0.00017176131937810175,
+      "loss": 0.6921,
+      "step": 1507
+    },
+    {
+      "epoch": 0.2680888888888889,
+      "grad_norm": 0.3782745121888272,
+      "learning_rate": 0.0001717212060045461,
+      "loss": 0.6557,
+      "step": 1508
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.3443728763321807,
+      "learning_rate": 0.00017168106885171632,
+      "loss": 0.6353,
+      "step": 1509
+    },
+    {
+      "epoch": 0.26844444444444443,
+      "grad_norm": 0.34783461804761934,
+      "learning_rate": 0.0001716409079329199,
+      "loss": 0.6489,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2686222222222222,
+      "grad_norm": 0.380783713498233,
+      "learning_rate": 0.0001716007232614723,
+      "loss": 0.6725,
+      "step": 1511
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.35949483830814144,
+      "learning_rate": 0.0001715605148506968,
+      "loss": 0.6503,
+      "step": 1512
+    },
+    {
+      "epoch": 0.26897777777777776,
+      "grad_norm": 0.3787937251536407,
+      "learning_rate": 0.00017152028271392452,
+      "loss": 0.6362,
+      "step": 1513
+    },
+    {
+      "epoch": 0.26915555555555554,
+      "grad_norm": 0.35752388274199165,
+      "learning_rate": 0.00017148002686449455,
+      "loss": 0.6802,
+      "step": 1514
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.3631834986468229,
+      "learning_rate": 0.00017143974731575372,
+      "loss": 0.6426,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2695111111111111,
+      "grad_norm": 0.3716725179700024,
+      "learning_rate": 0.00017139944408105676,
+      "loss": 0.6634,
+      "step": 1516
+    },
+    {
+      "epoch": 0.26968888888888887,
+      "grad_norm": 0.37151151693173484,
+      "learning_rate": 0.00017135911717376637,
+      "loss": 0.6523,
+      "step": 1517
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.3537830104493746,
+      "learning_rate": 0.0001713187666072529,
+      "loss": 0.6381,
+      "step": 1518
+    },
+    {
+      "epoch": 0.2700444444444444,
+      "grad_norm": 0.37948907695017964,
+      "learning_rate": 0.0001712783923948947,
+      "loss": 0.6873,
+      "step": 1519
+    },
+    {
+      "epoch": 0.2702222222222222,
+      "grad_norm": 0.3589776462448231,
+      "learning_rate": 0.00017123799455007785,
+      "loss": 0.6103,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.38306600167977467,
+      "learning_rate": 0.00017119757308619639,
+      "loss": 0.6686,
+      "step": 1521
+    },
+    {
+      "epoch": 0.27057777777777775,
+      "grad_norm": 0.38983566563127414,
+      "learning_rate": 0.000171157128016652,
+      "loss": 0.6767,
+      "step": 1522
+    },
+    {
+      "epoch": 0.2707555555555556,
+      "grad_norm": 0.40123979606818033,
+      "learning_rate": 0.00017111665935485443,
+      "loss": 0.6556,
+      "step": 1523
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.36046598015206405,
+      "learning_rate": 0.00017107616711422102,
+      "loss": 0.6521,
+      "step": 1524
+    },
+    {
+      "epoch": 0.27111111111111114,
+      "grad_norm": 0.3746026331035882,
+      "learning_rate": 0.00017103565130817714,
+      "loss": 0.6776,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2712888888888889,
+      "grad_norm": 0.36511639433237236,
+      "learning_rate": 0.00017099511195015575,
+      "loss": 0.6519,
+      "step": 1526
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.3633731597608729,
+      "learning_rate": 0.00017095454905359785,
+      "loss": 0.6659,
+      "step": 1527
+    },
+    {
+      "epoch": 0.27164444444444447,
+      "grad_norm": 0.38796582885425795,
+      "learning_rate": 0.00017091396263195204,
+      "loss": 0.6976,
+      "step": 1528
+    },
+    {
+      "epoch": 0.27182222222222224,
+      "grad_norm": 0.38319622356798844,
+      "learning_rate": 0.00017087335269867483,
+      "loss": 0.6914,
+      "step": 1529
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.386540050254295,
+      "learning_rate": 0.00017083271926723054,
+      "loss": 0.6741,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2721777777777778,
+      "grad_norm": 0.35020812187299805,
+      "learning_rate": 0.00017079206235109124,
+      "loss": 0.7154,
+      "step": 1531
+    },
+    {
+      "epoch": 0.2723555555555556,
+      "grad_norm": 0.36149340273852243,
+      "learning_rate": 0.00017075138196373675,
+      "loss": 0.6468,
+      "step": 1532
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.3697273524960641,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.6594,
+      "step": 1533
+    },
+    {
+      "epoch": 0.2727111111111111,
+      "grad_norm": 0.3639779555457213,
+      "learning_rate": 0.00017066995082934067,
+      "loss": 0.7021,
+      "step": 1534
+    },
+    {
+      "epoch": 0.2728888888888889,
+      "grad_norm": 0.3501220217072064,
+      "learning_rate": 0.00017062920010929767,
+      "loss": 0.6239,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.37338940495234596,
+      "learning_rate": 0.00017058842597203672,
+      "loss": 0.6722,
+      "step": 1536
+    },
+    {
+      "epoch": 0.27324444444444446,
+      "grad_norm": 0.3901137584685992,
+      "learning_rate": 0.00017054762843107658,
+      "loss": 0.6898,
+      "step": 1537
+    },
+    {
+      "epoch": 0.27342222222222223,
+      "grad_norm": 0.3940088695648882,
+      "learning_rate": 0.00017050680749994369,
+      "loss": 0.6576,
+      "step": 1538
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3777716988422191,
+      "learning_rate": 0.0001704659631921723,
+      "loss": 0.6055,
+      "step": 1539
+    },
+    {
+      "epoch": 0.2737777777777778,
+      "grad_norm": 0.4085346731201099,
+      "learning_rate": 0.00017042509552130444,
+      "loss": 0.6651,
+      "step": 1540
+    },
+    {
+      "epoch": 0.27395555555555556,
+      "grad_norm": 0.3634068203641392,
+      "learning_rate": 0.00017038420450088981,
+      "loss": 0.6514,
+      "step": 1541
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.3529669987589849,
+      "learning_rate": 0.0001703432901444859,
+      "loss": 0.6088,
+      "step": 1542
+    },
+    {
+      "epoch": 0.2743111111111111,
+      "grad_norm": 0.38791716697186734,
+      "learning_rate": 0.00017030235246565795,
+      "loss": 0.6367,
+      "step": 1543
+    },
+    {
+      "epoch": 0.2744888888888889,
+      "grad_norm": 0.36447585530490223,
+      "learning_rate": 0.0001702613914779789,
+      "loss": 0.6303,
+      "step": 1544
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.3686450777894795,
+      "learning_rate": 0.00017022040719502933,
+      "loss": 0.598,
+      "step": 1545
+    },
+    {
+      "epoch": 0.27484444444444445,
+      "grad_norm": 0.36849929161762734,
+      "learning_rate": 0.0001701793996303978,
+      "loss": 0.6524,
+      "step": 1546
+    },
+    {
+      "epoch": 0.2750222222222222,
+      "grad_norm": 0.3645542029984298,
+      "learning_rate": 0.00017013836879768035,
+      "loss": 0.6406,
+      "step": 1547
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.35776886253842693,
+      "learning_rate": 0.00017009731471048081,
+      "loss": 0.6372,
+      "step": 1548
+    },
+    {
+      "epoch": 0.2753777777777778,
+      "grad_norm": 0.39784186436800856,
+      "learning_rate": 0.00017005623738241074,
+      "loss": 0.6988,
+      "step": 1549
+    },
+    {
+      "epoch": 0.27555555555555555,
+      "grad_norm": 0.3613603388204999,
+      "learning_rate": 0.00017001513682708938,
+      "loss": 0.6602,
+      "step": 1550
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.3643154210364165,
+      "learning_rate": 0.00016997401305814371,
+      "loss": 0.6586,
+      "step": 1551
+    },
+    {
+      "epoch": 0.2759111111111111,
+      "grad_norm": 0.3620172362211282,
+      "learning_rate": 0.00016993286608920833,
+      "loss": 0.6389,
+      "step": 1552
+    },
+    {
+      "epoch": 0.2760888888888889,
+      "grad_norm": 0.3478875358901487,
+      "learning_rate": 0.0001698916959339256,
+      "loss": 0.6715,
+      "step": 1553
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4116150103202986,
+      "learning_rate": 0.00016985050260594556,
+      "loss": 0.6675,
+      "step": 1554
+    },
+    {
+      "epoch": 0.27644444444444444,
+      "grad_norm": 0.37958738781837387,
+      "learning_rate": 0.0001698092861189259,
+      "loss": 0.6438,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2766222222222222,
+      "grad_norm": 0.3594664827463607,
+      "learning_rate": 0.00016976804648653204,
+      "loss": 0.6508,
+      "step": 1556
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.3867221588300807,
+      "learning_rate": 0.00016972678372243703,
+      "loss": 0.7147,
+      "step": 1557
+    },
+    {
+      "epoch": 0.27697777777777777,
+      "grad_norm": 0.37346307836797943,
+      "learning_rate": 0.00016968549784032155,
+      "loss": 0.6362,
+      "step": 1558
+    },
+    {
+      "epoch": 0.27715555555555554,
+      "grad_norm": 0.3929228850167982,
+      "learning_rate": 0.0001696441888538741,
+      "loss": 0.7053,
+      "step": 1559
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.3569171020247684,
+      "learning_rate": 0.0001696028567767906,
+      "loss": 0.647,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2775111111111111,
+      "grad_norm": 0.3449446587472281,
+      "learning_rate": 0.0001695615016227749,
+      "loss": 0.6413,
+      "step": 1561
+    },
+    {
+      "epoch": 0.2776888888888889,
+      "grad_norm": 0.3529974545581637,
+      "learning_rate": 0.0001695201234055383,
+      "loss": 0.6408,
+      "step": 1562
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.3553907525760485,
+      "learning_rate": 0.0001694787221387998,
+      "loss": 0.644,
+      "step": 1563
+    },
+    {
+      "epoch": 0.2780444444444444,
+      "grad_norm": 0.35211306907171414,
+      "learning_rate": 0.00016943729783628608,
+      "loss": 0.6302,
+      "step": 1564
+    },
+    {
+      "epoch": 0.2782222222222222,
+      "grad_norm": 0.3540440729162601,
+      "learning_rate": 0.0001693958505117314,
+      "loss": 0.648,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.33210994154598633,
+      "learning_rate": 0.00016935438017887772,
+      "loss": 0.604,
+      "step": 1566
+    },
+    {
+      "epoch": 0.27857777777777776,
+      "grad_norm": 0.4525180092565884,
+      "learning_rate": 0.00016931288685147455,
+      "loss": 0.6771,
+      "step": 1567
+    },
+    {
+      "epoch": 0.27875555555555553,
+      "grad_norm": 0.3596065489704551,
+      "learning_rate": 0.00016927137054327908,
+      "loss": 0.6327,
+      "step": 1568
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.37794117734583466,
+      "learning_rate": 0.00016922983126805614,
+      "loss": 0.6861,
+      "step": 1569
+    },
+    {
+      "epoch": 0.2791111111111111,
+      "grad_norm": 0.38868813944137415,
+      "learning_rate": 0.0001691882690395781,
+      "loss": 0.6819,
+      "step": 1570
+    },
+    {
+      "epoch": 0.27928888888888886,
+      "grad_norm": 0.38780003968249405,
+      "learning_rate": 0.00016914668387162497,
+      "loss": 0.6881,
+      "step": 1571
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.37979083416409737,
+      "learning_rate": 0.00016910507577798443,
+      "loss": 0.6783,
+      "step": 1572
+    },
+    {
+      "epoch": 0.2796444444444444,
+      "grad_norm": 0.34529841350312207,
+      "learning_rate": 0.00016906344477245165,
+      "loss": 0.6259,
+      "step": 1573
+    },
+    {
+      "epoch": 0.27982222222222225,
+      "grad_norm": 0.3592364311448176,
+      "learning_rate": 0.00016902179086882948,
+      "loss": 0.6632,
+      "step": 1574
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.36716844209260896,
+      "learning_rate": 0.00016898011408092832,
+      "loss": 0.649,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2801777777777778,
+      "grad_norm": 0.36409500573185516,
+      "learning_rate": 0.00016893841442256618,
+      "loss": 0.6785,
+      "step": 1576
+    },
+    {
+      "epoch": 0.2803555555555556,
+      "grad_norm": 0.3582229703727218,
+      "learning_rate": 0.00016889669190756868,
+      "loss": 0.6174,
+      "step": 1577
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.3573340906124708,
+      "learning_rate": 0.0001688549465497689,
+      "loss": 0.6133,
+      "step": 1578
+    },
+    {
+      "epoch": 0.28071111111111113,
+      "grad_norm": 0.36536457632216834,
+      "learning_rate": 0.00016881317836300766,
+      "loss": 0.6649,
+      "step": 1579
+    },
+    {
+      "epoch": 0.2808888888888889,
+      "grad_norm": 0.3655700899178259,
+      "learning_rate": 0.00016877138736113323,
+      "loss": 0.6306,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.3511758005548785,
+      "learning_rate": 0.00016872957355800144,
+      "loss": 0.6881,
+      "step": 1581
+    },
+    {
+      "epoch": 0.28124444444444446,
+      "grad_norm": 0.36451707401360406,
+      "learning_rate": 0.0001686877369674758,
+      "loss": 0.6748,
+      "step": 1582
+    },
+    {
+      "epoch": 0.28142222222222224,
+      "grad_norm": 0.36792045896596826,
+      "learning_rate": 0.00016864587760342725,
+      "loss": 0.6641,
+      "step": 1583
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.40129163977490706,
+      "learning_rate": 0.00016860399547973431,
+      "loss": 0.6976,
+      "step": 1584
+    },
+    {
+      "epoch": 0.2817777777777778,
+      "grad_norm": 0.36823163820797866,
+      "learning_rate": 0.0001685620906102831,
+      "loss": 0.6791,
+      "step": 1585
+    },
+    {
+      "epoch": 0.28195555555555557,
+      "grad_norm": 0.37108838575009506,
+      "learning_rate": 0.0001685201630089672,
+      "loss": 0.6129,
+      "step": 1586
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.3738833425629442,
+      "learning_rate": 0.00016847821268968784,
+      "loss": 0.6854,
+      "step": 1587
+    },
+    {
+      "epoch": 0.2823111111111111,
+      "grad_norm": 0.3630864417219976,
+      "learning_rate": 0.00016843623966635366,
+      "loss": 0.6151,
+      "step": 1588
+    },
+    {
+      "epoch": 0.2824888888888889,
+      "grad_norm": 0.37306805915697583,
+      "learning_rate": 0.00016839424395288083,
+      "loss": 0.6859,
+      "step": 1589
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.36990542494596307,
+      "learning_rate": 0.00016835222556319315,
+      "loss": 0.6525,
+      "step": 1590
+    },
+    {
+      "epoch": 0.28284444444444445,
+      "grad_norm": 0.39720331269463327,
+      "learning_rate": 0.00016831018451122194,
+      "loss": 0.614,
+      "step": 1591
+    },
+    {
+      "epoch": 0.28302222222222223,
+      "grad_norm": 0.36559407445369607,
+      "learning_rate": 0.00016826812081090586,
+      "loss": 0.6612,
+      "step": 1592
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4101962332697649,
+      "learning_rate": 0.00016822603447619127,
+      "loss": 0.7124,
+      "step": 1593
+    },
+    {
+      "epoch": 0.2833777777777778,
+      "grad_norm": 0.36202439944229253,
+      "learning_rate": 0.00016818392552103194,
+      "loss": 0.6556,
+      "step": 1594
+    },
+    {
+      "epoch": 0.28355555555555556,
+      "grad_norm": 0.36569188513336376,
+      "learning_rate": 0.00016814179395938913,
+      "loss": 0.6506,
+      "step": 1595
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.3543236669712553,
+      "learning_rate": 0.00016809963980523164,
+      "loss": 0.593,
+      "step": 1596
+    },
+    {
+      "epoch": 0.2839111111111111,
+      "grad_norm": 0.35247266506067887,
+      "learning_rate": 0.00016805746307253574,
+      "loss": 0.6726,
+      "step": 1597
+    },
+    {
+      "epoch": 0.2840888888888889,
+      "grad_norm": 0.3740907670356731,
+      "learning_rate": 0.00016801526377528523,
+      "loss": 0.7119,
+      "step": 1598
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.3987839060348559,
+      "learning_rate": 0.0001679730419274713,
+      "loss": 0.6501,
+      "step": 1599
+    },
+    {
+      "epoch": 0.28444444444444444,
+      "grad_norm": 0.40372348789282286,
+      "learning_rate": 0.00016793079754309268,
+      "loss": 0.713,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2846222222222222,
+      "grad_norm": 0.36237509073160673,
+      "learning_rate": 0.00016788853063615556,
+      "loss": 0.6107,
+      "step": 1601
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.38266541140634475,
+      "learning_rate": 0.0001678462412206736,
+      "loss": 0.6672,
+      "step": 1602
+    },
+    {
+      "epoch": 0.2849777777777778,
+      "grad_norm": 0.35249444514464395,
+      "learning_rate": 0.00016780392931066792,
+      "loss": 0.6315,
+      "step": 1603
+    },
+    {
+      "epoch": 0.28515555555555555,
+      "grad_norm": 0.36639923282166315,
+      "learning_rate": 0.0001677615949201671,
+      "loss": 0.668,
+      "step": 1604
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.3454093182608668,
+      "learning_rate": 0.0001677192380632072,
+      "loss": 0.6568,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2855111111111111,
+      "grad_norm": 0.3593673595788356,
+      "learning_rate": 0.00016767685875383162,
+      "loss": 0.6466,
+      "step": 1606
+    },
+    {
+      "epoch": 0.2856888888888889,
+      "grad_norm": 0.35355527121251307,
+      "learning_rate": 0.00016763445700609134,
+      "loss": 0.6494,
+      "step": 1607
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.367848583122886,
+      "learning_rate": 0.00016759203283404475,
+      "loss": 0.6844,
+      "step": 1608
+    },
+    {
+      "epoch": 0.28604444444444443,
+      "grad_norm": 0.36825423348572783,
+      "learning_rate": 0.00016754958625175758,
+      "loss": 0.612,
+      "step": 1609
+    },
+    {
+      "epoch": 0.2862222222222222,
+      "grad_norm": 0.37562533596910713,
+      "learning_rate": 0.0001675071172733031,
+      "loss": 0.7046,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.352316300311903,
+      "learning_rate": 0.000167464625912762,
+      "loss": 0.6341,
+      "step": 1611
+    },
+    {
+      "epoch": 0.28657777777777776,
+      "grad_norm": 0.36726691871719863,
+      "learning_rate": 0.00016742211218422225,
+      "loss": 0.6598,
+      "step": 1612
+    },
+    {
+      "epoch": 0.28675555555555554,
+      "grad_norm": 0.37202547875973163,
+      "learning_rate": 0.00016737957610177942,
+      "loss": 0.6544,
+      "step": 1613
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.39590029044820857,
+      "learning_rate": 0.0001673370176795364,
+      "loss": 0.6456,
+      "step": 1614
+    },
+    {
+      "epoch": 0.2871111111111111,
+      "grad_norm": 0.33959767179359196,
+      "learning_rate": 0.0001672944369316035,
+      "loss": 0.569,
+      "step": 1615
+    },
+    {
+      "epoch": 0.28728888888888887,
+      "grad_norm": 0.3844339484101618,
+      "learning_rate": 0.00016725183387209845,
+      "loss": 0.643,
+      "step": 1616
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.40312131507951543,
+      "learning_rate": 0.0001672092085151463,
+      "loss": 0.6893,
+      "step": 1617
+    },
+    {
+      "epoch": 0.2876444444444444,
+      "grad_norm": 0.38037957999982946,
+      "learning_rate": 0.00016716656087487959,
+      "loss": 0.6253,
+      "step": 1618
+    },
+    {
+      "epoch": 0.2878222222222222,
+      "grad_norm": 0.38918651766949613,
+      "learning_rate": 0.00016712389096543818,
+      "loss": 0.7049,
+      "step": 1619
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3903890800343581,
+      "learning_rate": 0.00016708119880096942,
+      "loss": 0.6865,
+      "step": 1620
+    },
+    {
+      "epoch": 0.28817777777777775,
+      "grad_norm": 0.40719651196996154,
+      "learning_rate": 0.00016703848439562785,
+      "loss": 0.6355,
+      "step": 1621
+    },
+    {
+      "epoch": 0.28835555555555553,
+      "grad_norm": 0.38801940260098977,
+      "learning_rate": 0.0001669957477635756,
+      "loss": 0.6919,
+      "step": 1622
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.39761241812893516,
+      "learning_rate": 0.00016695298891898202,
+      "loss": 0.6334,
+      "step": 1623
+    },
+    {
+      "epoch": 0.2887111111111111,
+      "grad_norm": 0.36975082129572434,
+      "learning_rate": 0.00016691020787602386,
+      "loss": 0.6486,
+      "step": 1624
+    },
+    {
+      "epoch": 0.28888888888888886,
+      "grad_norm": 0.36544262517956666,
+      "learning_rate": 0.00016686740464888521,
+      "loss": 0.6418,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.3741479104412605,
+      "learning_rate": 0.00016682457925175763,
+      "loss": 0.6447,
+      "step": 1626
+    },
+    {
+      "epoch": 0.28924444444444447,
+      "grad_norm": 0.36114846927211625,
+      "learning_rate": 0.0001667817316988399,
+      "loss": 0.659,
+      "step": 1627
+    },
+    {
+      "epoch": 0.28942222222222225,
+      "grad_norm": 0.37757858853180404,
+      "learning_rate": 0.00016673886200433818,
+      "loss": 0.6637,
+      "step": 1628
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3790427429504027,
+      "learning_rate": 0.00016669597018246598,
+      "loss": 0.6589,
+      "step": 1629
+    },
+    {
+      "epoch": 0.2897777777777778,
+      "grad_norm": 0.3698428846175222,
+      "learning_rate": 0.00016665305624744415,
+      "loss": 0.664,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2899555555555556,
+      "grad_norm": 0.35903265930230455,
+      "learning_rate": 0.00016661012021350092,
+      "loss": 0.6271,
+      "step": 1631
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.3991257430531067,
+      "learning_rate": 0.00016656716209487174,
+      "loss": 0.6621,
+      "step": 1632
+    },
+    {
+      "epoch": 0.29031111111111113,
+      "grad_norm": 0.35614584184602,
+      "learning_rate": 0.00016652418190579943,
+      "loss": 0.6283,
+      "step": 1633
+    },
+    {
+      "epoch": 0.2904888888888889,
+      "grad_norm": 0.35609420898718935,
+      "learning_rate": 0.00016648117966053418,
+      "loss": 0.6271,
+      "step": 1634
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.40418988531579475,
+      "learning_rate": 0.00016643815537333346,
+      "loss": 0.7326,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29084444444444446,
+      "grad_norm": 0.3769344314020853,
+      "learning_rate": 0.00016639510905846195,
+      "loss": 0.6557,
+      "step": 1636
+    },
+    {
+      "epoch": 0.29102222222222224,
+      "grad_norm": 0.398178854541475,
+      "learning_rate": 0.00016635204073019183,
+      "loss": 0.7394,
+      "step": 1637
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.362286540124684,
+      "learning_rate": 0.00016630895040280238,
+      "loss": 0.678,
+      "step": 1638
+    },
+    {
+      "epoch": 0.2913777777777778,
+      "grad_norm": 0.35741926867254975,
+      "learning_rate": 0.00016626583809058033,
+      "loss": 0.6789,
+      "step": 1639
+    },
+    {
+      "epoch": 0.29155555555555557,
+      "grad_norm": 0.38382435380886026,
+      "learning_rate": 0.00016622270380781958,
+      "loss": 0.6815,
+      "step": 1640
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.3591472386834611,
+      "learning_rate": 0.00016617954756882144,
+      "loss": 0.6197,
+      "step": 1641
+    },
+    {
+      "epoch": 0.2919111111111111,
+      "grad_norm": 0.3830388642691941,
+      "learning_rate": 0.00016613636938789435,
+      "loss": 0.677,
+      "step": 1642
+    },
+    {
+      "epoch": 0.2920888888888889,
+      "grad_norm": 0.3582069695503815,
+      "learning_rate": 0.0001660931692793541,
+      "loss": 0.634,
+      "step": 1643
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.36018299074696664,
+      "learning_rate": 0.00016604994725752379,
+      "loss": 0.6863,
+      "step": 1644
+    },
+    {
+      "epoch": 0.29244444444444445,
+      "grad_norm": 0.3615518963205158,
+      "learning_rate": 0.00016600670333673375,
+      "loss": 0.6196,
+      "step": 1645
+    },
+    {
+      "epoch": 0.29262222222222223,
+      "grad_norm": 0.3503360141879489,
+      "learning_rate": 0.00016596343753132154,
+      "loss": 0.6427,
+      "step": 1646
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4182386117411707,
+      "learning_rate": 0.000165920149855632,
+      "loss": 0.6454,
+      "step": 1647
+    },
+    {
+      "epoch": 0.2929777777777778,
+      "grad_norm": 0.3717216798529551,
+      "learning_rate": 0.00016587684032401732,
+      "loss": 0.6742,
+      "step": 1648
+    },
+    {
+      "epoch": 0.29315555555555556,
+      "grad_norm": 0.3719433656275183,
+      "learning_rate": 0.00016583350895083666,
+      "loss": 0.6499,
+      "step": 1649
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.38615751025353456,
+      "learning_rate": 0.00016579015575045677,
+      "loss": 0.6471,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2935111111111111,
+      "grad_norm": 0.3715801680741638,
+      "learning_rate": 0.0001657467807372514,
+      "loss": 0.6622,
+      "step": 1651
+    },
+    {
+      "epoch": 0.2936888888888889,
+      "grad_norm": 0.34774148937984606,
+      "learning_rate": 0.00016570338392560155,
+      "loss": 0.6689,
+      "step": 1652
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.36754532374650367,
+      "learning_rate": 0.0001656599653298956,
+      "loss": 0.6772,
+      "step": 1653
+    },
+    {
+      "epoch": 0.29404444444444444,
+      "grad_norm": 0.3650260431663795,
+      "learning_rate": 0.000165616524964529,
+      "loss": 0.6497,
+      "step": 1654
+    },
+    {
+      "epoch": 0.2942222222222222,
+      "grad_norm": 0.35679064413053624,
+      "learning_rate": 0.00016557306284390445,
+      "loss": 0.6605,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3707613764546627,
+      "learning_rate": 0.0001655295789824319,
+      "loss": 0.7071,
+      "step": 1656
+    },
+    {
+      "epoch": 0.29457777777777777,
+      "grad_norm": 0.3639229761322604,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.6396,
+      "step": 1657
+    },
+    {
+      "epoch": 0.29475555555555555,
+      "grad_norm": 0.38098051320159065,
+      "learning_rate": 0.0001654425460946186,
+      "loss": 0.6301,
+      "step": 1658
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.37483174017176313,
+      "learning_rate": 0.00016539899709713373,
+      "loss": 0.6693,
+      "step": 1659
+    },
+    {
+      "epoch": 0.2951111111111111,
+      "grad_norm": 0.3614679507704923,
+      "learning_rate": 0.00016535542641651262,
+      "loss": 0.6871,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2952888888888889,
+      "grad_norm": 0.3853837325412522,
+      "learning_rate": 0.0001653118340672012,
+      "loss": 0.6617,
+      "step": 1661
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.36349561612140896,
+      "learning_rate": 0.00016526822006365257,
+      "loss": 0.6277,
+      "step": 1662
+    },
+    {
+      "epoch": 0.29564444444444443,
+      "grad_norm": 0.3546970965262031,
+      "learning_rate": 0.00016522458442032702,
+      "loss": 0.6257,
+      "step": 1663
+    },
+    {
+      "epoch": 0.2958222222222222,
+      "grad_norm": 0.3932794704259257,
+      "learning_rate": 0.00016518092715169202,
+      "loss": 0.6413,
+      "step": 1664
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.3804945926184189,
+      "learning_rate": 0.00016513724827222227,
+      "loss": 0.6533,
+      "step": 1665
+    },
+    {
+      "epoch": 0.29617777777777776,
+      "grad_norm": 0.39375658232461064,
+      "learning_rate": 0.00016509354779639944,
+      "loss": 0.6478,
+      "step": 1666
+    },
+    {
+      "epoch": 0.29635555555555554,
+      "grad_norm": 0.35285292293367065,
+      "learning_rate": 0.00016504982573871253,
+      "loss": 0.6374,
+      "step": 1667
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.3405577410350097,
+      "learning_rate": 0.0001650060821136577,
+      "loss": 0.6367,
+      "step": 1668
+    },
+    {
+      "epoch": 0.2967111111111111,
+      "grad_norm": 0.34877181340632607,
+      "learning_rate": 0.0001649623169357382,
+      "loss": 0.6531,
+      "step": 1669
+    },
+    {
+      "epoch": 0.29688888888888887,
+      "grad_norm": 0.34941513955424286,
+      "learning_rate": 0.00016491853021946443,
+      "loss": 0.6284,
+      "step": 1670
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.38174250521150227,
+      "learning_rate": 0.00016487472197935393,
+      "loss": 0.7331,
+      "step": 1671
+    },
+    {
+      "epoch": 0.2972444444444444,
+      "grad_norm": 0.359302494372468,
+      "learning_rate": 0.0001648308922299314,
+      "loss": 0.6425,
+      "step": 1672
+    },
+    {
+      "epoch": 0.2974222222222222,
+      "grad_norm": 0.34153908692702345,
+      "learning_rate": 0.0001647870409857287,
+      "loss": 0.6198,
+      "step": 1673
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3575729020900639,
+      "learning_rate": 0.0001647431682612847,
+      "loss": 0.7009,
+      "step": 1674
+    },
+    {
+      "epoch": 0.29777777777777775,
+      "grad_norm": 0.3556820727438643,
+      "learning_rate": 0.0001646992740711455,
+      "loss": 0.5996,
+      "step": 1675
+    },
+    {
+      "epoch": 0.29795555555555553,
+      "grad_norm": 0.3842106915170282,
+      "learning_rate": 0.00016465535842986434,
+      "loss": 0.6785,
+      "step": 1676
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.37541548993651636,
+      "learning_rate": 0.0001646114213520014,
+      "loss": 0.6125,
+      "step": 1677
+    },
+    {
+      "epoch": 0.29831111111111114,
+      "grad_norm": 0.4604844744115064,
+      "learning_rate": 0.00016456746285212418,
+      "loss": 0.7025,
+      "step": 1678
+    },
+    {
+      "epoch": 0.2984888888888889,
+      "grad_norm": 0.3798955846806804,
+      "learning_rate": 0.00016452348294480716,
+      "loss": 0.6338,
+      "step": 1679
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.3685450257934339,
+      "learning_rate": 0.00016447948164463196,
+      "loss": 0.6488,
+      "step": 1680
+    },
+    {
+      "epoch": 0.29884444444444447,
+      "grad_norm": 0.3775056589359029,
+      "learning_rate": 0.00016443545896618723,
+      "loss": 0.6383,
+      "step": 1681
+    },
+    {
+      "epoch": 0.29902222222222224,
+      "grad_norm": 0.3771388170336481,
+      "learning_rate": 0.0001643914149240688,
+      "loss": 0.6656,
+      "step": 1682
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.36586734107527763,
+      "learning_rate": 0.00016434734953287955,
+      "loss": 0.6576,
+      "step": 1683
+    },
+    {
+      "epoch": 0.2993777777777778,
+      "grad_norm": 0.3776476015047614,
+      "learning_rate": 0.00016430326280722935,
+      "loss": 0.626,
+      "step": 1684
+    },
+    {
+      "epoch": 0.2995555555555556,
+      "grad_norm": 0.356129705155627,
+      "learning_rate": 0.00016425915476173532,
+      "loss": 0.6196,
+      "step": 1685
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.36271314440779684,
+      "learning_rate": 0.00016421502541102148,
+      "loss": 0.6887,
+      "step": 1686
+    },
+    {
+      "epoch": 0.29991111111111113,
+      "grad_norm": 0.36242908957370806,
+      "learning_rate": 0.000164170874769719,
+      "loss": 0.6398,
+      "step": 1687
+    },
+    {
+      "epoch": 0.3000888888888889,
+      "grad_norm": 0.36457516674878454,
+      "learning_rate": 0.0001641267028524661,
+      "loss": 0.6086,
+      "step": 1688
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4072474505512138,
+      "learning_rate": 0.00016408250967390805,
+      "loss": 0.7279,
+      "step": 1689
+    },
+    {
+      "epoch": 0.30044444444444446,
+      "grad_norm": 0.3647595393407218,
+      "learning_rate": 0.00016403829524869719,
+      "loss": 0.6395,
+      "step": 1690
+    },
+    {
+      "epoch": 0.30062222222222224,
+      "grad_norm": 0.3468439266721444,
+      "learning_rate": 0.00016399405959149278,
+      "loss": 0.6153,
+      "step": 1691
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3505758726562104,
+      "learning_rate": 0.00016394980271696133,
+      "loss": 0.6533,
+      "step": 1692
+    },
+    {
+      "epoch": 0.3009777777777778,
+      "grad_norm": 0.36954713137927453,
+      "learning_rate": 0.00016390552463977623,
+      "loss": 0.6853,
+      "step": 1693
+    },
+    {
+      "epoch": 0.30115555555555557,
+      "grad_norm": 0.4086642593848733,
+      "learning_rate": 0.0001638612253746179,
+      "loss": 0.5944,
+      "step": 1694
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.3728852684537193,
+      "learning_rate": 0.00016381690493617393,
+      "loss": 0.5958,
+      "step": 1695
+    },
+    {
+      "epoch": 0.3015111111111111,
+      "grad_norm": 0.37227237329823,
+      "learning_rate": 0.0001637725633391387,
+      "loss": 0.6364,
+      "step": 1696
+    },
+    {
+      "epoch": 0.3016888888888889,
+      "grad_norm": 0.3515896502389464,
+      "learning_rate": 0.00016372820059821388,
+      "loss": 0.6083,
+      "step": 1697
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.35558746860390805,
+      "learning_rate": 0.00016368381672810786,
+      "loss": 0.5973,
+      "step": 1698
+    },
+    {
+      "epoch": 0.30204444444444445,
+      "grad_norm": 0.39096283723134945,
+      "learning_rate": 0.00016363941174353628,
+      "loss": 0.697,
+      "step": 1699
+    },
+    {
+      "epoch": 0.3022222222222222,
+      "grad_norm": 0.3761212982411811,
+      "learning_rate": 0.00016359498565922165,
+      "loss": 0.6841,
+      "step": 1700
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.41384762557019056,
+      "learning_rate": 0.00016355053848989348,
+      "loss": 0.6274,
+      "step": 1701
+    },
+    {
+      "epoch": 0.3025777777777778,
+      "grad_norm": 0.3875701235326229,
+      "learning_rate": 0.00016350607025028834,
+      "loss": 0.6507,
+      "step": 1702
+    },
+    {
+      "epoch": 0.30275555555555556,
+      "grad_norm": 0.3523967540916701,
+      "learning_rate": 0.00016346158095514968,
+      "loss": 0.6093,
+      "step": 1703
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.37237022080565885,
+      "learning_rate": 0.00016341707061922803,
+      "loss": 0.6641,
+      "step": 1704
+    },
+    {
+      "epoch": 0.3031111111111111,
+      "grad_norm": 0.38752436104983895,
+      "learning_rate": 0.0001633725392572809,
+      "loss": 0.696,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3032888888888889,
+      "grad_norm": 0.3644395139666274,
+      "learning_rate": 0.0001633279868840727,
+      "loss": 0.6243,
+      "step": 1706
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.3587656545832138,
+      "learning_rate": 0.00016328341351437478,
+      "loss": 0.6723,
+      "step": 1707
+    },
+    {
+      "epoch": 0.30364444444444444,
+      "grad_norm": 0.3805289941564587,
+      "learning_rate": 0.0001632388191629656,
+      "loss": 0.6629,
+      "step": 1708
+    },
+    {
+      "epoch": 0.3038222222222222,
+      "grad_norm": 0.3619187332038642,
+      "learning_rate": 0.0001631942038446304,
+      "loss": 0.6295,
+      "step": 1709
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.39152956006743905,
+      "learning_rate": 0.00016314956757416154,
+      "loss": 0.6679,
+      "step": 1710
+    },
+    {
+      "epoch": 0.30417777777777777,
+      "grad_norm": 0.38136815429344956,
+      "learning_rate": 0.00016310491036635816,
+      "loss": 0.6561,
+      "step": 1711
+    },
+    {
+      "epoch": 0.30435555555555555,
+      "grad_norm": 0.3555085182286857,
+      "learning_rate": 0.0001630602322360265,
+      "loss": 0.6088,
+      "step": 1712
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.38749766289318294,
+      "learning_rate": 0.0001630155331979796,
+      "loss": 0.6496,
+      "step": 1713
+    },
+    {
+      "epoch": 0.3047111111111111,
+      "grad_norm": 0.37253326298486855,
+      "learning_rate": 0.0001629708132670375,
+      "loss": 0.6401,
+      "step": 1714
+    },
+    {
+      "epoch": 0.3048888888888889,
+      "grad_norm": 0.3604371225636213,
+      "learning_rate": 0.0001629260724580272,
+      "loss": 0.6479,
+      "step": 1715
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.36211094683281686,
+      "learning_rate": 0.00016288131078578258,
+      "loss": 0.671,
+      "step": 1716
+    },
+    {
+      "epoch": 0.30524444444444443,
+      "grad_norm": 0.3535167380697168,
+      "learning_rate": 0.0001628365282651444,
+      "loss": 0.6389,
+      "step": 1717
+    },
+    {
+      "epoch": 0.3054222222222222,
+      "grad_norm": 0.3674531278583611,
+      "learning_rate": 0.0001627917249109604,
+      "loss": 0.6864,
+      "step": 1718
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.36792805833937736,
+      "learning_rate": 0.0001627469007380852,
+      "loss": 0.6756,
+      "step": 1719
+    },
+    {
+      "epoch": 0.30577777777777776,
+      "grad_norm": 0.33062236265772743,
+      "learning_rate": 0.00016270205576138032,
+      "loss": 0.6255,
+      "step": 1720
+    },
+    {
+      "epoch": 0.30595555555555554,
+      "grad_norm": 0.36465488896191917,
+      "learning_rate": 0.00016265718999571415,
+      "loss": 0.6481,
+      "step": 1721
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.37826787882977797,
+      "learning_rate": 0.00016261230345596207,
+      "loss": 0.633,
+      "step": 1722
+    },
+    {
+      "epoch": 0.3063111111111111,
+      "grad_norm": 0.3764301507046969,
+      "learning_rate": 0.00016256739615700622,
+      "loss": 0.6795,
+      "step": 1723
+    },
+    {
+      "epoch": 0.30648888888888887,
+      "grad_norm": 0.36503316126263086,
+      "learning_rate": 0.0001625224681137357,
+      "loss": 0.65,
+      "step": 1724
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.37363993230389003,
+      "learning_rate": 0.00016247751934104647,
+      "loss": 0.6044,
+      "step": 1725
+    },
+    {
+      "epoch": 0.3068444444444444,
+      "grad_norm": 0.38272736827927206,
+      "learning_rate": 0.00016243254985384137,
+      "loss": 0.702,
+      "step": 1726
+    },
+    {
+      "epoch": 0.3070222222222222,
+      "grad_norm": 0.36838419648838244,
+      "learning_rate": 0.0001623875596670301,
+      "loss": 0.6142,
+      "step": 1727
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3946411710598122,
+      "learning_rate": 0.0001623425487955292,
+      "loss": 0.6539,
+      "step": 1728
+    },
+    {
+      "epoch": 0.3073777777777778,
+      "grad_norm": 0.37169451135801346,
+      "learning_rate": 0.00016229751725426212,
+      "loss": 0.6668,
+      "step": 1729
+    },
+    {
+      "epoch": 0.3075555555555556,
+      "grad_norm": 0.37064746387431535,
+      "learning_rate": 0.00016225246505815916,
+      "loss": 0.6577,
+      "step": 1730
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.38655703582976847,
+      "learning_rate": 0.00016220739222215738,
+      "loss": 0.6644,
+      "step": 1731
+    },
+    {
+      "epoch": 0.30791111111111114,
+      "grad_norm": 0.36351047913657664,
+      "learning_rate": 0.0001621622987612008,
+      "loss": 0.601,
+      "step": 1732
+    },
+    {
+      "epoch": 0.3080888888888889,
+      "grad_norm": 0.36850199094876956,
+      "learning_rate": 0.00016211718469024019,
+      "loss": 0.6966,
+      "step": 1733
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.390154892595685,
+      "learning_rate": 0.0001620720500242332,
+      "loss": 0.6786,
+      "step": 1734
+    },
+    {
+      "epoch": 0.30844444444444447,
+      "grad_norm": 0.35824182930425935,
+      "learning_rate": 0.0001620268947781443,
+      "loss": 0.6472,
+      "step": 1735
+    },
+    {
+      "epoch": 0.30862222222222224,
+      "grad_norm": 0.35450289701538007,
+      "learning_rate": 0.0001619817189669448,
+      "loss": 0.6484,
+      "step": 1736
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.36600383412112,
+      "learning_rate": 0.00016193652260561279,
+      "loss": 0.6959,
+      "step": 1737
+    },
+    {
+      "epoch": 0.3089777777777778,
+      "grad_norm": 0.388772885514875,
+      "learning_rate": 0.0001618913057091332,
+      "loss": 0.6624,
+      "step": 1738
+    },
+    {
+      "epoch": 0.3091555555555556,
+      "grad_norm": 0.39064081120691746,
+      "learning_rate": 0.00016184606829249768,
+      "loss": 0.61,
+      "step": 1739
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.3542687929481353,
+      "learning_rate": 0.0001618008103707049,
+      "loss": 0.6312,
+      "step": 1740
+    },
+    {
+      "epoch": 0.3095111111111111,
+      "grad_norm": 0.3727749325508445,
+      "learning_rate": 0.0001617555319587601,
+      "loss": 0.6593,
+      "step": 1741
+    },
+    {
+      "epoch": 0.3096888888888889,
+      "grad_norm": 0.36140017898631316,
+      "learning_rate": 0.00016171023307167545,
+      "loss": 0.657,
+      "step": 1742
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.358738075807226,
+      "learning_rate": 0.00016166491372446984,
+      "loss": 0.6756,
+      "step": 1743
+    },
+    {
+      "epoch": 0.31004444444444446,
+      "grad_norm": 0.38091348380994894,
+      "learning_rate": 0.00016161957393216896,
+      "loss": 0.6673,
+      "step": 1744
+    },
+    {
+      "epoch": 0.31022222222222223,
+      "grad_norm": 0.38779010299402067,
+      "learning_rate": 0.0001615742137098053,
+      "loss": 0.6643,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.35724222431246083,
+      "learning_rate": 0.00016152883307241815,
+      "loss": 0.6564,
+      "step": 1746
+    },
+    {
+      "epoch": 0.3105777777777778,
+      "grad_norm": 0.36116431074929833,
+      "learning_rate": 0.00016148343203505346,
+      "loss": 0.7278,
+      "step": 1747
+    },
+    {
+      "epoch": 0.31075555555555556,
+      "grad_norm": 0.3629916030165536,
+      "learning_rate": 0.00016143801061276403,
+      "loss": 0.6386,
+      "step": 1748
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.395746393048325,
+      "learning_rate": 0.00016139256882060946,
+      "loss": 0.7123,
+      "step": 1749
+    },
+    {
+      "epoch": 0.3111111111111111,
+      "grad_norm": 0.3575212517637486,
+      "learning_rate": 0.00016134710667365596,
+      "loss": 0.6527,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3112888888888889,
+      "grad_norm": 0.36500223084327904,
+      "learning_rate": 0.0001613016241869766,
+      "loss": 0.6502,
+      "step": 1751
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.37427191136196,
+      "learning_rate": 0.00016125612137565123,
+      "loss": 0.7022,
+      "step": 1752
+    },
+    {
+      "epoch": 0.31164444444444445,
+      "grad_norm": 0.3493185167633601,
+      "learning_rate": 0.0001612105982547663,
+      "loss": 0.5804,
+      "step": 1753
+    },
+    {
+      "epoch": 0.3118222222222222,
+      "grad_norm": 0.37903019466021276,
+      "learning_rate": 0.00016116505483941505,
+      "loss": 0.6641,
+      "step": 1754
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.351886067019922,
+      "learning_rate": 0.0001611194911446976,
+      "loss": 0.6186,
+      "step": 1755
+    },
+    {
+      "epoch": 0.3121777777777778,
+      "grad_norm": 0.3608822364958209,
+      "learning_rate": 0.00016107390718572053,
+      "loss": 0.6128,
+      "step": 1756
+    },
+    {
+      "epoch": 0.31235555555555555,
+      "grad_norm": 0.36834705618611674,
+      "learning_rate": 0.0001610283029775973,
+      "loss": 0.6335,
+      "step": 1757
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.363195299854787,
+      "learning_rate": 0.0001609826785354481,
+      "loss": 0.6371,
+      "step": 1758
+    },
+    {
+      "epoch": 0.3127111111111111,
+      "grad_norm": 0.3639046661794866,
+      "learning_rate": 0.0001609370338743997,
+      "loss": 0.6385,
+      "step": 1759
+    },
+    {
+      "epoch": 0.3128888888888889,
+      "grad_norm": 0.37292250685136824,
+      "learning_rate": 0.00016089136900958577,
+      "loss": 0.6648,
+      "step": 1760
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.3728888359050662,
+      "learning_rate": 0.00016084568395614648,
+      "loss": 0.6558,
+      "step": 1761
+    },
+    {
+      "epoch": 0.31324444444444444,
+      "grad_norm": 0.36093429000488164,
+      "learning_rate": 0.00016079997872922878,
+      "loss": 0.6406,
+      "step": 1762
+    },
+    {
+      "epoch": 0.3134222222222222,
+      "grad_norm": 0.36252748591563455,
+      "learning_rate": 0.00016075425334398635,
+      "loss": 0.6511,
+      "step": 1763
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.37795241750301034,
+      "learning_rate": 0.00016070850781557948,
+      "loss": 0.6204,
+      "step": 1764
+    },
+    {
+      "epoch": 0.31377777777777777,
+      "grad_norm": 0.4232720202696655,
+      "learning_rate": 0.00016066274215917518,
+      "loss": 0.7007,
+      "step": 1765
+    },
+    {
+      "epoch": 0.31395555555555554,
+      "grad_norm": 0.3890812863566723,
+      "learning_rate": 0.00016061695638994715,
+      "loss": 0.6882,
+      "step": 1766
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.3816459860778058,
+      "learning_rate": 0.00016057115052307567,
+      "loss": 0.7129,
+      "step": 1767
+    },
+    {
+      "epoch": 0.3143111111111111,
+      "grad_norm": 0.5666069535202038,
+      "learning_rate": 0.00016052532457374777,
+      "loss": 0.6174,
+      "step": 1768
+    },
+    {
+      "epoch": 0.3144888888888889,
+      "grad_norm": 0.36688081306723974,
+      "learning_rate": 0.00016047947855715714,
+      "loss": 0.6964,
+      "step": 1769
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.34358146967871445,
+      "learning_rate": 0.00016043361248850406,
+      "loss": 0.5922,
+      "step": 1770
+    },
+    {
+      "epoch": 0.3148444444444444,
+      "grad_norm": 0.43275736691447925,
+      "learning_rate": 0.0001603877263829955,
+      "loss": 0.6407,
+      "step": 1771
+    },
+    {
+      "epoch": 0.3150222222222222,
+      "grad_norm": 0.3525551262689683,
+      "learning_rate": 0.0001603418202558451,
+      "loss": 0.6261,
+      "step": 1772
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3651264953192697,
+      "learning_rate": 0.00016029589412227307,
+      "loss": 0.6257,
+      "step": 1773
+    },
+    {
+      "epoch": 0.31537777777777776,
+      "grad_norm": 0.3553831704757985,
+      "learning_rate": 0.00016024994799750632,
+      "loss": 0.6228,
+      "step": 1774
+    },
+    {
+      "epoch": 0.31555555555555553,
+      "grad_norm": 0.3745489016338945,
+      "learning_rate": 0.0001602039818967783,
+      "loss": 0.6568,
+      "step": 1775
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.4166500432344376,
+      "learning_rate": 0.0001601579958353292,
+      "loss": 0.6399,
+      "step": 1776
+    },
+    {
+      "epoch": 0.3159111111111111,
+      "grad_norm": 0.38709085436032326,
+      "learning_rate": 0.00016011198982840576,
+      "loss": 0.6704,
+      "step": 1777
+    },
+    {
+      "epoch": 0.31608888888888886,
+      "grad_norm": 0.3806685510090918,
+      "learning_rate": 0.0001600659638912613,
+      "loss": 0.6962,
+      "step": 1778
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.3520549535927781,
+      "learning_rate": 0.00016001991803915583,
+      "loss": 0.6566,
+      "step": 1779
+    },
+    {
+      "epoch": 0.3164444444444444,
+      "grad_norm": 0.37084815554146033,
+      "learning_rate": 0.00015997385228735592,
+      "loss": 0.6801,
+      "step": 1780
+    },
+    {
+      "epoch": 0.31662222222222225,
+      "grad_norm": 0.36193607226819585,
+      "learning_rate": 0.0001599277666511347,
+      "loss": 0.6717,
+      "step": 1781
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.36978156836532877,
+      "learning_rate": 0.00015988166114577198,
+      "loss": 0.6537,
+      "step": 1782
+    },
+    {
+      "epoch": 0.3169777777777778,
+      "grad_norm": 0.36933082120015776,
+      "learning_rate": 0.00015983553578655408,
+      "loss": 0.6583,
+      "step": 1783
+    },
+    {
+      "epoch": 0.3171555555555556,
+      "grad_norm": 0.3610825036347129,
+      "learning_rate": 0.00015978939058877394,
+      "loss": 0.6443,
+      "step": 1784
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.3716920478229916,
+      "learning_rate": 0.00015974322556773108,
+      "loss": 0.6619,
+      "step": 1785
+    },
+    {
+      "epoch": 0.31751111111111113,
+      "grad_norm": 0.39740997112701565,
+      "learning_rate": 0.00015969704073873157,
+      "loss": 0.6356,
+      "step": 1786
+    },
+    {
+      "epoch": 0.3176888888888889,
+      "grad_norm": 0.3853483598971617,
+      "learning_rate": 0.00015965083611708809,
+      "loss": 0.658,
+      "step": 1787
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.35427312769562114,
+      "learning_rate": 0.00015960461171811977,
+      "loss": 0.6257,
+      "step": 1788
+    },
+    {
+      "epoch": 0.31804444444444446,
+      "grad_norm": 0.3721771209526191,
+      "learning_rate": 0.00015955836755715249,
+      "loss": 0.6568,
+      "step": 1789
+    },
+    {
+      "epoch": 0.31822222222222224,
+      "grad_norm": 0.3476821094871773,
+      "learning_rate": 0.0001595121036495185,
+      "loss": 0.6294,
+      "step": 1790
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.3795396867231499,
+      "learning_rate": 0.00015946582001055668,
+      "loss": 0.6847,
+      "step": 1791
+    },
+    {
+      "epoch": 0.3185777777777778,
+      "grad_norm": 0.36512089149574023,
+      "learning_rate": 0.00015941951665561244,
+      "loss": 0.6763,
+      "step": 1792
+    },
+    {
+      "epoch": 0.31875555555555557,
+      "grad_norm": 0.4040724874737144,
+      "learning_rate": 0.00015937319360003773,
+      "loss": 0.6906,
+      "step": 1793
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.3762924143149502,
+      "learning_rate": 0.00015932685085919105,
+      "loss": 0.7092,
+      "step": 1794
+    },
+    {
+      "epoch": 0.3191111111111111,
+      "grad_norm": 0.3603653277849108,
+      "learning_rate": 0.00015928048844843738,
+      "loss": 0.6756,
+      "step": 1795
+    },
+    {
+      "epoch": 0.3192888888888889,
+      "grad_norm": 0.37668503947766757,
+      "learning_rate": 0.00015923410638314826,
+      "loss": 0.7037,
+      "step": 1796
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.4088187612134446,
+      "learning_rate": 0.0001591877046787017,
+      "loss": 0.7297,
+      "step": 1797
+    },
+    {
+      "epoch": 0.31964444444444445,
+      "grad_norm": 0.34933068244720505,
+      "learning_rate": 0.00015914128335048236,
+      "loss": 0.6241,
+      "step": 1798
+    },
+    {
+      "epoch": 0.31982222222222223,
+      "grad_norm": 0.4183559140427564,
+      "learning_rate": 0.00015909484241388117,
+      "loss": 0.6778,
+      "step": 1799
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6017798847895347,
+      "learning_rate": 0.00015904838188429574,
+      "loss": 0.6494,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3201777777777778,
+      "grad_norm": 0.37336083054040586,
+      "learning_rate": 0.00015900190177713016,
+      "loss": 0.5924,
+      "step": 1801
+    },
+    {
+      "epoch": 0.32035555555555556,
+      "grad_norm": 0.4413415636910764,
+      "learning_rate": 0.00015895540210779494,
+      "loss": 0.6839,
+      "step": 1802
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.3733855261534981,
+      "learning_rate": 0.00015890888289170712,
+      "loss": 0.658,
+      "step": 1803
+    },
+    {
+      "epoch": 0.3207111111111111,
+      "grad_norm": 0.36384878107670715,
+      "learning_rate": 0.00015886234414429028,
+      "loss": 0.6882,
+      "step": 1804
+    },
+    {
+      "epoch": 0.3208888888888889,
+      "grad_norm": 0.37360750715798086,
+      "learning_rate": 0.00015881578588097431,
+      "loss": 0.595,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.3974141428210875,
+      "learning_rate": 0.00015876920811719577,
+      "loss": 0.6385,
+      "step": 1806
+    },
+    {
+      "epoch": 0.32124444444444444,
+      "grad_norm": 0.40345037870858486,
+      "learning_rate": 0.0001587226108683975,
+      "loss": 0.7248,
+      "step": 1807
+    },
+    {
+      "epoch": 0.3214222222222222,
+      "grad_norm": 0.3505421880151765,
+      "learning_rate": 0.00015867599415002895,
+      "loss": 0.6414,
+      "step": 1808
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4125416949844472,
+      "learning_rate": 0.00015862935797754594,
+      "loss": 0.6434,
+      "step": 1809
+    },
+    {
+      "epoch": 0.3217777777777778,
+      "grad_norm": 0.37937903549654906,
+      "learning_rate": 0.00015858270236641077,
+      "loss": 0.6631,
+      "step": 1810
+    },
+    {
+      "epoch": 0.32195555555555555,
+      "grad_norm": 0.37299629316427885,
+      "learning_rate": 0.00015853602733209216,
+      "loss": 0.6556,
+      "step": 1811
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.3718028224852353,
+      "learning_rate": 0.0001584893328900653,
+      "loss": 0.6679,
+      "step": 1812
+    },
+    {
+      "epoch": 0.3223111111111111,
+      "grad_norm": 0.4010053897187422,
+      "learning_rate": 0.00015844261905581183,
+      "loss": 0.6169,
+      "step": 1813
+    },
+    {
+      "epoch": 0.3224888888888889,
+      "grad_norm": 0.3735438093247827,
+      "learning_rate": 0.00015839588584481976,
+      "loss": 0.6296,
+      "step": 1814
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.3677969231783713,
+      "learning_rate": 0.00015834913327258355,
+      "loss": 0.6525,
+      "step": 1815
+    },
+    {
+      "epoch": 0.32284444444444443,
+      "grad_norm": 0.3779629852696358,
+      "learning_rate": 0.0001583023613546041,
+      "loss": 0.6608,
+      "step": 1816
+    },
+    {
+      "epoch": 0.3230222222222222,
+      "grad_norm": 0.3733931946361051,
+      "learning_rate": 0.00015825557010638871,
+      "loss": 0.6304,
+      "step": 1817
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.35646577004806546,
+      "learning_rate": 0.00015820875954345112,
+      "loss": 0.6737,
+      "step": 1818
+    },
+    {
+      "epoch": 0.32337777777777776,
+      "grad_norm": 0.39482963686923433,
+      "learning_rate": 0.00015816192968131138,
+      "loss": 0.6465,
+      "step": 1819
+    },
+    {
+      "epoch": 0.32355555555555554,
+      "grad_norm": 0.37743364160341825,
+      "learning_rate": 0.00015811508053549606,
+      "loss": 0.6772,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.36428252499879177,
+      "learning_rate": 0.000158068212121538,
+      "loss": 0.6472,
+      "step": 1821
+    },
+    {
+      "epoch": 0.3239111111111111,
+      "grad_norm": 0.3528531852682479,
+      "learning_rate": 0.00015802132445497658,
+      "loss": 0.6596,
+      "step": 1822
+    },
+    {
+      "epoch": 0.32408888888888887,
+      "grad_norm": 0.37254030182342834,
+      "learning_rate": 0.00015797441755135738,
+      "loss": 0.6748,
+      "step": 1823
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.36942472368324847,
+      "learning_rate": 0.00015792749142623253,
+      "loss": 0.6551,
+      "step": 1824
+    },
+    {
+      "epoch": 0.3244444444444444,
+      "grad_norm": 0.3392271728880086,
+      "learning_rate": 0.00015788054609516044,
+      "loss": 0.6353,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3246222222222222,
+      "grad_norm": 0.39079625838679477,
+      "learning_rate": 0.00015783358157370588,
+      "loss": 0.7011,
+      "step": 1826
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.36209126504254585,
+      "learning_rate": 0.00015778659787744,
+      "loss": 0.6629,
+      "step": 1827
+    },
+    {
+      "epoch": 0.32497777777777775,
+      "grad_norm": 0.35524844820269663,
+      "learning_rate": 0.00015773959502194039,
+      "loss": 0.6367,
+      "step": 1828
+    },
+    {
+      "epoch": 0.32515555555555553,
+      "grad_norm": 0.34709356704915956,
+      "learning_rate": 0.00015769257302279086,
+      "loss": 0.6643,
+      "step": 1829
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.36694795171343575,
+      "learning_rate": 0.0001576455318955816,
+      "loss": 0.663,
+      "step": 1830
+    },
+    {
+      "epoch": 0.3255111111111111,
+      "grad_norm": 0.3498280223812707,
+      "learning_rate": 0.0001575984716559092,
+      "loss": 0.6298,
+      "step": 1831
+    },
+    {
+      "epoch": 0.3256888888888889,
+      "grad_norm": 0.3774613788220334,
+      "learning_rate": 0.00015755139231937658,
+      "loss": 0.662,
+      "step": 1832
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.368998514643403,
+      "learning_rate": 0.00015750429390159294,
+      "loss": 0.6468,
+      "step": 1833
+    },
+    {
+      "epoch": 0.32604444444444447,
+      "grad_norm": 0.38546270014141504,
+      "learning_rate": 0.0001574571764181738,
+      "loss": 0.6975,
+      "step": 1834
+    },
+    {
+      "epoch": 0.32622222222222225,
+      "grad_norm": 0.3766520943968019,
+      "learning_rate": 0.00015741003988474107,
+      "loss": 0.6303,
+      "step": 1835
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3762880289124078,
+      "learning_rate": 0.00015736288431692294,
+      "loss": 0.6848,
+      "step": 1836
+    },
+    {
+      "epoch": 0.3265777777777778,
+      "grad_norm": 0.35878869094377924,
+      "learning_rate": 0.00015731570973035394,
+      "loss": 0.6632,
+      "step": 1837
+    },
+    {
+      "epoch": 0.3267555555555556,
+      "grad_norm": 0.3776189270689113,
+      "learning_rate": 0.00015726851614067475,
+      "loss": 0.6709,
+      "step": 1838
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.3696737850469688,
+      "learning_rate": 0.0001572213035635326,
+      "loss": 0.6274,
+      "step": 1839
+    },
+    {
+      "epoch": 0.32711111111111113,
+      "grad_norm": 0.36342965643412434,
+      "learning_rate": 0.00015717407201458087,
+      "loss": 0.6443,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3272888888888889,
+      "grad_norm": 0.3569793187005551,
+      "learning_rate": 0.00015712682150947923,
+      "loss": 0.6664,
+      "step": 1841
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.3697345076612493,
+      "learning_rate": 0.00015707955206389367,
+      "loss": 0.6619,
+      "step": 1842
+    },
+    {
+      "epoch": 0.32764444444444446,
+      "grad_norm": 0.37888208154541403,
+      "learning_rate": 0.0001570322636934964,
+      "loss": 0.6812,
+      "step": 1843
+    },
+    {
+      "epoch": 0.32782222222222224,
+      "grad_norm": 0.3827436328962468,
+      "learning_rate": 0.00015698495641396602,
+      "loss": 0.6701,
+      "step": 1844
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3502815762468765,
+      "learning_rate": 0.00015693763024098728,
+      "loss": 0.6466,
+      "step": 1845
+    },
+    {
+      "epoch": 0.3281777777777778,
+      "grad_norm": 0.3788627369656457,
+      "learning_rate": 0.00015689028519025127,
+      "loss": 0.6322,
+      "step": 1846
+    },
+    {
+      "epoch": 0.32835555555555557,
+      "grad_norm": 0.3689993966092909,
+      "learning_rate": 0.0001568429212774553,
+      "loss": 0.6136,
+      "step": 1847
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.366200099038874,
+      "learning_rate": 0.00015679553851830297,
+      "loss": 0.6321,
+      "step": 1848
+    },
+    {
+      "epoch": 0.3287111111111111,
+      "grad_norm": 0.3749624111907132,
+      "learning_rate": 0.00015674813692850408,
+      "loss": 0.6315,
+      "step": 1849
+    },
+    {
+      "epoch": 0.3288888888888889,
+      "grad_norm": 0.3485067743402259,
+      "learning_rate": 0.00015670071652377468,
+      "loss": 0.6119,
+      "step": 1850
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.37490919937573847,
+      "learning_rate": 0.00015665327731983713,
+      "loss": 0.6245,
+      "step": 1851
+    },
+    {
+      "epoch": 0.32924444444444445,
+      "grad_norm": 0.3767796980955667,
+      "learning_rate": 0.00015660581933241993,
+      "loss": 0.6792,
+      "step": 1852
+    },
+    {
+      "epoch": 0.3294222222222222,
+      "grad_norm": 0.39562638401911415,
+      "learning_rate": 0.00015655834257725788,
+      "loss": 0.6522,
+      "step": 1853
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3820660708180656,
+      "learning_rate": 0.00015651084707009192,
+      "loss": 0.6552,
+      "step": 1854
+    },
+    {
+      "epoch": 0.3297777777777778,
+      "grad_norm": 0.3967938699332167,
+      "learning_rate": 0.00015646333282666927,
+      "loss": 0.7019,
+      "step": 1855
+    },
+    {
+      "epoch": 0.32995555555555556,
+      "grad_norm": 0.3914165684715345,
+      "learning_rate": 0.0001564157998627434,
+      "loss": 0.6433,
+      "step": 1856
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.4681718778835079,
+      "learning_rate": 0.0001563682481940739,
+      "loss": 0.6338,
+      "step": 1857
+    },
+    {
+      "epoch": 0.3303111111111111,
+      "grad_norm": 0.3863744047782405,
+      "learning_rate": 0.00015632067783642658,
+      "loss": 0.679,
+      "step": 1858
+    },
+    {
+      "epoch": 0.3304888888888889,
+      "grad_norm": 0.36453496553135,
+      "learning_rate": 0.00015627308880557353,
+      "loss": 0.6552,
+      "step": 1859
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.36205310362923865,
+      "learning_rate": 0.00015622548111729286,
+      "loss": 0.6799,
+      "step": 1860
+    },
+    {
+      "epoch": 0.33084444444444444,
+      "grad_norm": 0.3371946679104084,
+      "learning_rate": 0.00015617785478736905,
+      "loss": 0.6309,
+      "step": 1861
+    },
+    {
+      "epoch": 0.3310222222222222,
+      "grad_norm": 0.36652767445020196,
+      "learning_rate": 0.00015613020983159265,
+      "loss": 0.6581,
+      "step": 1862
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.36055757358416896,
+      "learning_rate": 0.00015608254626576048,
+      "loss": 0.6039,
+      "step": 1863
+    },
+    {
+      "epoch": 0.33137777777777777,
+      "grad_norm": 0.3535281599186636,
+      "learning_rate": 0.00015603486410567535,
+      "loss": 0.6036,
+      "step": 1864
+    },
+    {
+      "epoch": 0.33155555555555555,
+      "grad_norm": 0.3808987736515503,
+      "learning_rate": 0.00015598716336714645,
+      "loss": 0.6435,
+      "step": 1865
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.35463981578553044,
+      "learning_rate": 0.00015593944406598896,
+      "loss": 0.6478,
+      "step": 1866
+    },
+    {
+      "epoch": 0.3319111111111111,
+      "grad_norm": 0.39314295836159147,
+      "learning_rate": 0.00015589170621802437,
+      "loss": 0.7147,
+      "step": 1867
+    },
+    {
+      "epoch": 0.3320888888888889,
+      "grad_norm": 0.3854939365192963,
+      "learning_rate": 0.00015584394983908018,
+      "loss": 0.6629,
+      "step": 1868
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.3672541368282204,
+      "learning_rate": 0.0001557961749449901,
+      "loss": 0.6558,
+      "step": 1869
+    },
+    {
+      "epoch": 0.33244444444444443,
+      "grad_norm": 0.35870691674772215,
+      "learning_rate": 0.00015574838155159396,
+      "loss": 0.6518,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3326222222222222,
+      "grad_norm": 0.3623003718581855,
+      "learning_rate": 0.00015570056967473774,
+      "loss": 0.5724,
+      "step": 1871
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.395314412669602,
+      "learning_rate": 0.00015565273933027356,
+      "loss": 0.6518,
+      "step": 1872
+    },
+    {
+      "epoch": 0.33297777777777776,
+      "grad_norm": 0.36923396657196245,
+      "learning_rate": 0.0001556048905340596,
+      "loss": 0.6758,
+      "step": 1873
+    },
+    {
+      "epoch": 0.33315555555555554,
+      "grad_norm": 0.4146976645229107,
+      "learning_rate": 0.00015555702330196023,
+      "loss": 0.692,
+      "step": 1874
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.3637142555453114,
+      "learning_rate": 0.0001555091376498459,
+      "loss": 0.6324,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3335111111111111,
+      "grad_norm": 0.36690442470997,
+      "learning_rate": 0.00015546123359359317,
+      "loss": 0.6225,
+      "step": 1876
+    },
+    {
+      "epoch": 0.33368888888888887,
+      "grad_norm": 0.3715761701911381,
+      "learning_rate": 0.00015541331114908469,
+      "loss": 0.5832,
+      "step": 1877
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.38519927693713374,
+      "learning_rate": 0.00015536537033220924,
+      "loss": 0.6874,
+      "step": 1878
+    },
+    {
+      "epoch": 0.3340444444444444,
+      "grad_norm": 0.3855424167318192,
+      "learning_rate": 0.00015531741115886165,
+      "loss": 0.6905,
+      "step": 1879
+    },
+    {
+      "epoch": 0.3342222222222222,
+      "grad_norm": 0.37280656368730114,
+      "learning_rate": 0.00015526943364494285,
+      "loss": 0.6543,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3556684338620708,
+      "learning_rate": 0.0001552214378063599,
+      "loss": 0.6316,
+      "step": 1881
+    },
+    {
+      "epoch": 0.33457777777777775,
+      "grad_norm": 0.3518349307137234,
+      "learning_rate": 0.00015517342365902584,
+      "loss": 0.6429,
+      "step": 1882
+    },
+    {
+      "epoch": 0.33475555555555553,
+      "grad_norm": 0.3663619991362937,
+      "learning_rate": 0.0001551253912188599,
+      "loss": 0.64,
+      "step": 1883
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.3785449964233379,
+      "learning_rate": 0.0001550773405017872,
+      "loss": 0.6481,
+      "step": 1884
+    },
+    {
+      "epoch": 0.33511111111111114,
+      "grad_norm": 0.38437528154960754,
+      "learning_rate": 0.00015502927152373914,
+      "loss": 0.6993,
+      "step": 1885
+    },
+    {
+      "epoch": 0.3352888888888889,
+      "grad_norm": 0.36715399454178416,
+      "learning_rate": 0.000154981184300653,
+      "loss": 0.6627,
+      "step": 1886
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.35146008161049724,
+      "learning_rate": 0.0001549330788484722,
+      "loss": 0.6402,
+      "step": 1887
+    },
+    {
+      "epoch": 0.33564444444444447,
+      "grad_norm": 0.3773256190589999,
+      "learning_rate": 0.00015488495518314616,
+      "loss": 0.6962,
+      "step": 1888
+    },
+    {
+      "epoch": 0.33582222222222224,
+      "grad_norm": 0.3513135901792078,
+      "learning_rate": 0.00015483681332063035,
+      "loss": 0.6289,
+      "step": 1889
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3604762705030136,
+      "learning_rate": 0.0001547886532768863,
+      "loss": 0.6291,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3361777777777778,
+      "grad_norm": 0.37906680708969126,
+      "learning_rate": 0.0001547404750678815,
+      "loss": 0.6263,
+      "step": 1891
+    },
+    {
+      "epoch": 0.3363555555555556,
+      "grad_norm": 0.3391435377844623,
+      "learning_rate": 0.00015469227870958956,
+      "loss": 0.6127,
+      "step": 1892
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4009699122530538,
+      "learning_rate": 0.00015464406421799,
+      "loss": 0.6774,
+      "step": 1893
+    },
+    {
+      "epoch": 0.3367111111111111,
+      "grad_norm": 0.37537286923319574,
+      "learning_rate": 0.00015459583160906847,
+      "loss": 0.6431,
+      "step": 1894
+    },
+    {
+      "epoch": 0.3368888888888889,
+      "grad_norm": 0.3656307483971322,
+      "learning_rate": 0.0001545475808988165,
+      "loss": 0.6615,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.3744616486364853,
+      "learning_rate": 0.0001544993121032318,
+      "loss": 0.6204,
+      "step": 1896
+    },
+    {
+      "epoch": 0.33724444444444446,
+      "grad_norm": 0.4252186356379338,
+      "learning_rate": 0.0001544510252383178,
+      "loss": 0.6453,
+      "step": 1897
+    },
+    {
+      "epoch": 0.33742222222222223,
+      "grad_norm": 0.3627415682909013,
+      "learning_rate": 0.0001544027203200842,
+      "loss": 0.6251,
+      "step": 1898
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.37295138064373845,
+      "learning_rate": 0.00015435439736454653,
+      "loss": 0.6258,
+      "step": 1899
+    },
+    {
+      "epoch": 0.3377777777777778,
+      "grad_norm": 0.354490949120553,
+      "learning_rate": 0.00015430605638772633,
+      "loss": 0.6094,
+      "step": 1900
+    },
+    {
+      "epoch": 0.33795555555555556,
+      "grad_norm": 0.3854865153643987,
+      "learning_rate": 0.00015425769740565114,
+      "loss": 0.6086,
+      "step": 1901
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.3549083537040871,
+      "learning_rate": 0.00015420932043435447,
+      "loss": 0.63,
+      "step": 1902
+    },
+    {
+      "epoch": 0.3383111111111111,
+      "grad_norm": 0.38869600865278875,
+      "learning_rate": 0.00015416092548987576,
+      "loss": 0.6444,
+      "step": 1903
+    },
+    {
+      "epoch": 0.3384888888888889,
+      "grad_norm": 0.39882571898590363,
+      "learning_rate": 0.0001541125125882604,
+      "loss": 0.7058,
+      "step": 1904
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.38074544193279036,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 0.6836,
+      "step": 1905
+    },
+    {
+      "epoch": 0.33884444444444445,
+      "grad_norm": 0.3782464583309121,
+      "learning_rate": 0.00015401563297783122,
+      "loss": 0.6966,
+      "step": 1906
+    },
+    {
+      "epoch": 0.3390222222222222,
+      "grad_norm": 0.37908419739217725,
+      "learning_rate": 0.000153967166301138,
+      "loss": 0.6565,
+      "step": 1907
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3562745958188283,
+      "learning_rate": 0.00015391868173154932,
+      "loss": 0.6834,
+      "step": 1908
+    },
+    {
+      "epoch": 0.3393777777777778,
+      "grad_norm": 0.3709950236861562,
+      "learning_rate": 0.0001538701792851403,
+      "loss": 0.6368,
+      "step": 1909
+    },
+    {
+      "epoch": 0.33955555555555555,
+      "grad_norm": 0.3593647531273344,
+      "learning_rate": 0.00015382165897799197,
+      "loss": 0.6403,
+      "step": 1910
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.36660696672625154,
+      "learning_rate": 0.00015377312082619134,
+      "loss": 0.6572,
+      "step": 1911
+    },
+    {
+      "epoch": 0.3399111111111111,
+      "grad_norm": 0.3712075544687035,
+      "learning_rate": 0.00015372456484583134,
+      "loss": 0.5955,
+      "step": 1912
+    },
+    {
+      "epoch": 0.3400888888888889,
+      "grad_norm": 0.37010885752726813,
+      "learning_rate": 0.0001536759910530107,
+      "loss": 0.6087,
+      "step": 1913
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.3684350141830777,
+      "learning_rate": 0.0001536273994638342,
+      "loss": 0.6141,
+      "step": 1914
+    },
+    {
+      "epoch": 0.34044444444444444,
+      "grad_norm": 0.35812770243586833,
+      "learning_rate": 0.00015357879009441242,
+      "loss": 0.6304,
+      "step": 1915
+    },
+    {
+      "epoch": 0.3406222222222222,
+      "grad_norm": 0.3857271645225178,
+      "learning_rate": 0.0001535301629608619,
+      "loss": 0.6721,
+      "step": 1916
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3948624529959523,
+      "learning_rate": 0.00015348151807930506,
+      "loss": 0.6069,
+      "step": 1917
+    },
+    {
+      "epoch": 0.34097777777777777,
+      "grad_norm": 0.37789159457866367,
+      "learning_rate": 0.00015343285546587013,
+      "loss": 0.682,
+      "step": 1918
+    },
+    {
+      "epoch": 0.34115555555555555,
+      "grad_norm": 0.383158960550221,
+      "learning_rate": 0.00015338417513669126,
+      "loss": 0.682,
+      "step": 1919
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.3809544849048687,
+      "learning_rate": 0.00015333547710790851,
+      "loss": 0.6805,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3415111111111111,
+      "grad_norm": 0.35530043473199124,
+      "learning_rate": 0.0001532867613956678,
+      "loss": 0.6734,
+      "step": 1921
+    },
+    {
+      "epoch": 0.3416888888888889,
+      "grad_norm": 0.3692528600382463,
+      "learning_rate": 0.00015323802801612093,
+      "loss": 0.6705,
+      "step": 1922
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.3623339603031708,
+      "learning_rate": 0.00015318927698542543,
+      "loss": 0.6855,
+      "step": 1923
+    },
+    {
+      "epoch": 0.34204444444444443,
+      "grad_norm": 0.39204956678696956,
+      "learning_rate": 0.00015314050831974484,
+      "loss": 0.671,
+      "step": 1924
+    },
+    {
+      "epoch": 0.3422222222222222,
+      "grad_norm": 0.3417444395877495,
+      "learning_rate": 0.00015309172203524854,
+      "loss": 0.6216,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3883036894526186,
+      "learning_rate": 0.0001530429181481116,
+      "loss": 0.6737,
+      "step": 1926
+    },
+    {
+      "epoch": 0.34257777777777776,
+      "grad_norm": 0.3590628431997195,
+      "learning_rate": 0.000152994096674515,
+      "loss": 0.6563,
+      "step": 1927
+    },
+    {
+      "epoch": 0.34275555555555554,
+      "grad_norm": 0.38054864056881726,
+      "learning_rate": 0.0001529452576306457,
+      "loss": 0.6054,
+      "step": 1928
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.3797573084893008,
+      "learning_rate": 0.00015289640103269625,
+      "loss": 0.7011,
+      "step": 1929
+    },
+    {
+      "epoch": 0.3431111111111111,
+      "grad_norm": 0.35130540715217634,
+      "learning_rate": 0.0001528475268968652,
+      "loss": 0.5918,
+      "step": 1930
+    },
+    {
+      "epoch": 0.34328888888888887,
+      "grad_norm": 0.3549938374042644,
+      "learning_rate": 0.0001527986352393568,
+      "loss": 0.6543,
+      "step": 1931
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.3743413391677129,
+      "learning_rate": 0.00015274972607638113,
+      "loss": 0.6457,
+      "step": 1932
+    },
+    {
+      "epoch": 0.3436444444444444,
+      "grad_norm": 0.3768501226984263,
+      "learning_rate": 0.00015270079942415418,
+      "loss": 0.5915,
+      "step": 1933
+    },
+    {
+      "epoch": 0.3438222222222222,
+      "grad_norm": 0.3761269969345535,
+      "learning_rate": 0.00015265185529889758,
+      "loss": 0.6547,
+      "step": 1934
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.38614267511717193,
+      "learning_rate": 0.00015260289371683884,
+      "loss": 0.6642,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3441777777777778,
+      "grad_norm": 0.3771827789566294,
+      "learning_rate": 0.00015255391469421128,
+      "loss": 0.6686,
+      "step": 1936
+    },
+    {
+      "epoch": 0.3443555555555556,
+      "grad_norm": 0.3905826974516789,
+      "learning_rate": 0.00015250491824725398,
+      "loss": 0.6985,
+      "step": 1937
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.36065501029392516,
+      "learning_rate": 0.00015245590439221172,
+      "loss": 0.6118,
+      "step": 1938
+    },
+    {
+      "epoch": 0.34471111111111113,
+      "grad_norm": 0.34729715327884514,
+      "learning_rate": 0.00015240687314533515,
+      "loss": 0.6258,
+      "step": 1939
+    },
+    {
+      "epoch": 0.3448888888888889,
+      "grad_norm": 0.37259047859861416,
+      "learning_rate": 0.00015235782452288068,
+      "loss": 0.6783,
+      "step": 1940
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.3814504904666184,
+      "learning_rate": 0.0001523087585411104,
+      "loss": 0.6575,
+      "step": 1941
+    },
+    {
+      "epoch": 0.34524444444444446,
+      "grad_norm": 0.3781408065572556,
+      "learning_rate": 0.0001522596752162923,
+      "loss": 0.6921,
+      "step": 1942
+    },
+    {
+      "epoch": 0.34542222222222224,
+      "grad_norm": 0.35917518608478877,
+      "learning_rate": 0.00015221057456469994,
+      "loss": 0.6535,
+      "step": 1943
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3716373107186432,
+      "learning_rate": 0.00015216145660261273,
+      "loss": 0.6189,
+      "step": 1944
+    },
+    {
+      "epoch": 0.3457777777777778,
+      "grad_norm": 0.3587359557996415,
+      "learning_rate": 0.00015211232134631586,
+      "loss": 0.6339,
+      "step": 1945
+    },
+    {
+      "epoch": 0.34595555555555557,
+      "grad_norm": 0.36367310244560014,
+      "learning_rate": 0.00015206316881210015,
+      "loss": 0.6455,
+      "step": 1946
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.3495511971982955,
+      "learning_rate": 0.00015201399901626218,
+      "loss": 0.6328,
+      "step": 1947
+    },
+    {
+      "epoch": 0.3463111111111111,
+      "grad_norm": 0.37986954406834667,
+      "learning_rate": 0.0001519648119751043,
+      "loss": 0.6349,
+      "step": 1948
+    },
+    {
+      "epoch": 0.3464888888888889,
+      "grad_norm": 0.3864259166335274,
+      "learning_rate": 0.00015191560770493458,
+      "loss": 0.6292,
+      "step": 1949
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.3646330412340634,
+      "learning_rate": 0.00015186638622206674,
+      "loss": 0.7031,
+      "step": 1950
+    },
+    {
+      "epoch": 0.34684444444444446,
+      "grad_norm": 0.3480623094221179,
+      "learning_rate": 0.0001518171475428202,
+      "loss": 0.6254,
+      "step": 1951
+    },
+    {
+      "epoch": 0.34702222222222223,
+      "grad_norm": 0.36775586963200424,
+      "learning_rate": 0.00015176789168352018,
+      "loss": 0.6697,
+      "step": 1952
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.3599707140978756,
+      "learning_rate": 0.0001517186186604975,
+      "loss": 0.6287,
+      "step": 1953
+    },
+    {
+      "epoch": 0.3473777777777778,
+      "grad_norm": 0.36247623051311173,
+      "learning_rate": 0.0001516693284900887,
+      "loss": 0.6461,
+      "step": 1954
+    },
+    {
+      "epoch": 0.34755555555555556,
+      "grad_norm": 0.35170666984812476,
+      "learning_rate": 0.000151620021188636,
+      "loss": 0.6576,
+      "step": 1955
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.3524731712393581,
+      "learning_rate": 0.00015157069677248738,
+      "loss": 0.6521,
+      "step": 1956
+    },
+    {
+      "epoch": 0.3479111111111111,
+      "grad_norm": 0.35716231359740586,
+      "learning_rate": 0.00015152135525799633,
+      "loss": 0.6612,
+      "step": 1957
+    },
+    {
+      "epoch": 0.3480888888888889,
+      "grad_norm": 0.3515983779832405,
+      "learning_rate": 0.00015147199666152222,
+      "loss": 0.6659,
+      "step": 1958
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.39883077487011037,
+      "learning_rate": 0.00015142262099942982,
+      "loss": 0.6292,
+      "step": 1959
+    },
+    {
+      "epoch": 0.34844444444444445,
+      "grad_norm": 0.349348810324865,
+      "learning_rate": 0.00015137322828808982,
+      "loss": 0.624,
+      "step": 1960
+    },
+    {
+      "epoch": 0.3486222222222222,
+      "grad_norm": 0.37855580962184154,
+      "learning_rate": 0.0001513238185438784,
+      "loss": 0.6336,
+      "step": 1961
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3830086106065981,
+      "learning_rate": 0.00015127439178317745,
+      "loss": 0.6606,
+      "step": 1962
+    },
+    {
+      "epoch": 0.3489777777777778,
+      "grad_norm": 0.37220350745279995,
+      "learning_rate": 0.0001512249480223745,
+      "loss": 0.6613,
+      "step": 1963
+    },
+    {
+      "epoch": 0.34915555555555555,
+      "grad_norm": 0.3686382842837492,
+      "learning_rate": 0.00015117548727786265,
+      "loss": 0.6881,
+      "step": 1964
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.37594924882120045,
+      "learning_rate": 0.00015112600956604074,
+      "loss": 0.683,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3495111111111111,
+      "grad_norm": 0.38365558467300287,
+      "learning_rate": 0.00015107651490331317,
+      "loss": 0.6648,
+      "step": 1966
+    },
+    {
+      "epoch": 0.3496888888888889,
+      "grad_norm": 0.3490925018748636,
+      "learning_rate": 0.00015102700330609,
+      "loss": 0.6277,
+      "step": 1967
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.33109148394762006,
+      "learning_rate": 0.0001509774747907868,
+      "loss": 0.5762,
+      "step": 1968
+    },
+    {
+      "epoch": 0.35004444444444444,
+      "grad_norm": 0.35354886274804637,
+      "learning_rate": 0.00015092792937382483,
+      "loss": 0.5943,
+      "step": 1969
+    },
+    {
+      "epoch": 0.3502222222222222,
+      "grad_norm": 0.35857196538644953,
+      "learning_rate": 0.000150878367071631,
+      "loss": 0.609,
+      "step": 1970
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.36722178664133814,
+      "learning_rate": 0.00015082878790063776,
+      "loss": 0.605,
+      "step": 1971
+    },
+    {
+      "epoch": 0.35057777777777777,
+      "grad_norm": 0.3689634845632975,
+      "learning_rate": 0.00015077919187728313,
+      "loss": 0.6549,
+      "step": 1972
+    },
+    {
+      "epoch": 0.35075555555555554,
+      "grad_norm": 0.42575878422861924,
+      "learning_rate": 0.00015072957901801076,
+      "loss": 0.6257,
+      "step": 1973
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.36567702736362295,
+      "learning_rate": 0.00015067994933926985,
+      "loss": 0.6575,
+      "step": 1974
+    },
+    {
+      "epoch": 0.3511111111111111,
+      "grad_norm": 0.39874234622292376,
+      "learning_rate": 0.00015063030285751526,
+      "loss": 0.7159,
+      "step": 1975
+    },
+    {
+      "epoch": 0.3512888888888889,
+      "grad_norm": 0.3935848186973221,
+      "learning_rate": 0.00015058063958920726,
+      "loss": 0.6284,
+      "step": 1976
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.3317955078156986,
+      "learning_rate": 0.00015053095955081184,
+      "loss": 0.6261,
+      "step": 1977
+    },
+    {
+      "epoch": 0.3516444444444444,
+      "grad_norm": 0.3683979992895569,
+      "learning_rate": 0.0001504812627588005,
+      "loss": 0.5753,
+      "step": 1978
+    },
+    {
+      "epoch": 0.3518222222222222,
+      "grad_norm": 0.3633136981036898,
+      "learning_rate": 0.00015043154922965028,
+      "loss": 0.6221,
+      "step": 1979
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3724394123561364,
+      "learning_rate": 0.00015038181897984374,
+      "loss": 0.6411,
+      "step": 1980
+    },
+    {
+      "epoch": 0.35217777777777776,
+      "grad_norm": 0.3591757601118717,
+      "learning_rate": 0.00015033207202586906,
+      "loss": 0.6459,
+      "step": 1981
+    },
+    {
+      "epoch": 0.35235555555555553,
+      "grad_norm": 0.3712084361937139,
+      "learning_rate": 0.0001502823083842199,
+      "loss": 0.6668,
+      "step": 1982
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.3606712812359245,
+      "learning_rate": 0.00015023252807139548,
+      "loss": 0.6445,
+      "step": 1983
+    },
+    {
+      "epoch": 0.3527111111111111,
+      "grad_norm": 0.3731001652780124,
+      "learning_rate": 0.0001501827311039005,
+      "loss": 0.6109,
+      "step": 1984
+    },
+    {
+      "epoch": 0.35288888888888886,
+      "grad_norm": 0.3531230524656085,
+      "learning_rate": 0.00015013291749824527,
+      "loss": 0.6454,
+      "step": 1985
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.3610863997758842,
+      "learning_rate": 0.00015008308727094554,
+      "loss": 0.6521,
+      "step": 1986
+    },
+    {
+      "epoch": 0.35324444444444447,
+      "grad_norm": 0.38521408902713933,
+      "learning_rate": 0.0001500332404385226,
+      "loss": 0.6026,
+      "step": 1987
+    },
+    {
+      "epoch": 0.35342222222222225,
+      "grad_norm": 0.35711335979526543,
+      "learning_rate": 0.00014998337701750325,
+      "loss": 0.6239,
+      "step": 1988
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3874402203662732,
+      "learning_rate": 0.00014993349702441977,
+      "loss": 0.7266,
+      "step": 1989
+    },
+    {
+      "epoch": 0.3537777777777778,
+      "grad_norm": 0.3695911002313585,
+      "learning_rate": 0.00014988360047580996,
+      "loss": 0.6324,
+      "step": 1990
+    },
+    {
+      "epoch": 0.3539555555555556,
+      "grad_norm": 0.3635293848274628,
+      "learning_rate": 0.00014983368738821713,
+      "loss": 0.6484,
+      "step": 1991
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.3613017541698347,
+      "learning_rate": 0.00014978375777818995,
+      "loss": 0.6448,
+      "step": 1992
+    },
+    {
+      "epoch": 0.35431111111111113,
+      "grad_norm": 0.33440682069469924,
+      "learning_rate": 0.00014973381166228272,
+      "loss": 0.6027,
+      "step": 1993
+    },
+    {
+      "epoch": 0.3544888888888889,
+      "grad_norm": 0.36943387967287367,
+      "learning_rate": 0.00014968384905705517,
+      "loss": 0.6233,
+      "step": 1994
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.35193679253771154,
+      "learning_rate": 0.0001496338699790724,
+      "loss": 0.6621,
+      "step": 1995
+    },
+    {
+      "epoch": 0.35484444444444446,
+      "grad_norm": 0.34982242455112916,
+      "learning_rate": 0.0001495838744449051,
+      "loss": 0.6162,
+      "step": 1996
+    },
+    {
+      "epoch": 0.35502222222222224,
+      "grad_norm": 0.3510048176570386,
+      "learning_rate": 0.0001495338624711294,
+      "loss": 0.6294,
+      "step": 1997
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.37252284842904776,
+      "learning_rate": 0.00014948383407432678,
+      "loss": 0.6425,
+      "step": 1998
+    },
+    {
+      "epoch": 0.3553777777777778,
+      "grad_norm": 0.33584495292898014,
+      "learning_rate": 0.00014943378927108426,
+      "loss": 0.6205,
+      "step": 1999
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.3833097467211567,
+      "learning_rate": 0.00014938372807799425,
+      "loss": 0.5696,
+      "step": 2000
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.3967682155529881,
+      "learning_rate": 0.0001493336505116546,
+      "loss": 0.6501,
+      "step": 2001
+    },
+    {
+      "epoch": 0.3559111111111111,
+      "grad_norm": 0.34637316067199425,
+      "learning_rate": 0.0001492835565886687,
+      "loss": 0.625,
+      "step": 2002
+    },
+    {
+      "epoch": 0.3560888888888889,
+      "grad_norm": 0.348732896387513,
+      "learning_rate": 0.0001492334463256452,
+      "loss": 0.6295,
+      "step": 2003
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.33839273097474754,
+      "learning_rate": 0.0001491833197391982,
+      "loss": 0.6521,
+      "step": 2004
+    },
+    {
+      "epoch": 0.35644444444444445,
+      "grad_norm": 0.34478921210181634,
+      "learning_rate": 0.00014913317684594728,
+      "loss": 0.6135,
+      "step": 2005
+    },
+    {
+      "epoch": 0.35662222222222223,
+      "grad_norm": 0.3350781131097185,
+      "learning_rate": 0.00014908301766251739,
+      "loss": 0.6132,
+      "step": 2006
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3457428656406698,
+      "learning_rate": 0.00014903284220553885,
+      "loss": 0.6265,
+      "step": 2007
+    },
+    {
+      "epoch": 0.3569777777777778,
+      "grad_norm": 0.3441985475780204,
+      "learning_rate": 0.00014898265049164748,
+      "loss": 0.6435,
+      "step": 2008
+    },
+    {
+      "epoch": 0.35715555555555556,
+      "grad_norm": 0.36776963685188147,
+      "learning_rate": 0.00014893244253748436,
+      "loss": 0.6647,
+      "step": 2009
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.35952266322726845,
+      "learning_rate": 0.00014888221835969605,
+      "loss": 0.6631,
+      "step": 2010
+    },
+    {
+      "epoch": 0.3575111111111111,
+      "grad_norm": 0.3659956712033054,
+      "learning_rate": 0.00014883197797493442,
+      "loss": 0.6265,
+      "step": 2011
+    },
+    {
+      "epoch": 0.3576888888888889,
+      "grad_norm": 0.3467382121018198,
+      "learning_rate": 0.00014878172139985675,
+      "loss": 0.6349,
+      "step": 2012
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.3957784753009316,
+      "learning_rate": 0.00014873144865112573,
+      "loss": 0.7023,
+      "step": 2013
+    },
+    {
+      "epoch": 0.35804444444444444,
+      "grad_norm": 0.35072125070146143,
+      "learning_rate": 0.0001486811597454093,
+      "loss": 0.6596,
+      "step": 2014
+    },
+    {
+      "epoch": 0.3582222222222222,
+      "grad_norm": 0.3632568771859667,
+      "learning_rate": 0.00014863085469938084,
+      "loss": 0.6317,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3810085439368567,
+      "learning_rate": 0.00014858053352971912,
+      "loss": 0.694,
+      "step": 2016
+    },
+    {
+      "epoch": 0.3585777777777778,
+      "grad_norm": 0.3938534272004421,
+      "learning_rate": 0.00014853019625310813,
+      "loss": 0.6896,
+      "step": 2017
+    },
+    {
+      "epoch": 0.35875555555555555,
+      "grad_norm": 0.3722351547253413,
+      "learning_rate": 0.0001484798428862373,
+      "loss": 0.617,
+      "step": 2018
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.36278454951864497,
+      "learning_rate": 0.00014842947344580133,
+      "loss": 0.6168,
+      "step": 2019
+    },
+    {
+      "epoch": 0.3591111111111111,
+      "grad_norm": 0.35893904729176807,
+      "learning_rate": 0.00014837908794850034,
+      "loss": 0.6224,
+      "step": 2020
+    },
+    {
+      "epoch": 0.3592888888888889,
+      "grad_norm": 0.382214848569764,
+      "learning_rate": 0.00014832868641103967,
+      "loss": 0.6681,
+      "step": 2021
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.3582634921068547,
+      "learning_rate": 0.00014827826885013007,
+      "loss": 0.6311,
+      "step": 2022
+    },
+    {
+      "epoch": 0.35964444444444443,
+      "grad_norm": 0.38137826123261026,
+      "learning_rate": 0.00014822783528248753,
+      "loss": 0.6363,
+      "step": 2023
+    },
+    {
+      "epoch": 0.3598222222222222,
+      "grad_norm": 0.36474734770259776,
+      "learning_rate": 0.00014817738572483338,
+      "loss": 0.6498,
+      "step": 2024
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.36072861571456216,
+      "learning_rate": 0.00014812692019389425,
+      "loss": 0.6148,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36017777777777776,
+      "grad_norm": 0.3716504575880546,
+      "learning_rate": 0.00014807643870640207,
+      "loss": 0.6213,
+      "step": 2026
+    },
+    {
+      "epoch": 0.36035555555555554,
+      "grad_norm": 0.349790460516821,
+      "learning_rate": 0.00014802594127909404,
+      "loss": 0.6617,
+      "step": 2027
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.36636570634623195,
+      "learning_rate": 0.00014797542792871265,
+      "loss": 0.6596,
+      "step": 2028
+    },
+    {
+      "epoch": 0.3607111111111111,
+      "grad_norm": 0.3225352366135295,
+      "learning_rate": 0.0001479248986720057,
+      "loss": 0.5834,
+      "step": 2029
+    },
+    {
+      "epoch": 0.36088888888888887,
+      "grad_norm": 0.3779073813598705,
+      "learning_rate": 0.00014787435352572623,
+      "loss": 0.6307,
+      "step": 2030
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.3642173384123533,
+      "learning_rate": 0.00014782379250663255,
+      "loss": 0.6531,
+      "step": 2031
+    },
+    {
+      "epoch": 0.3612444444444444,
+      "grad_norm": 0.3640970703178707,
+      "learning_rate": 0.0001477732156314883,
+      "loss": 0.6791,
+      "step": 2032
+    },
+    {
+      "epoch": 0.3614222222222222,
+      "grad_norm": 0.36313461672239566,
+      "learning_rate": 0.00014772262291706223,
+      "loss": 0.6809,
+      "step": 2033
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.37296127294131926,
+      "learning_rate": 0.00014767201438012847,
+      "loss": 0.6279,
+      "step": 2034
+    },
+    {
+      "epoch": 0.36177777777777775,
+      "grad_norm": 0.3835029026042995,
+      "learning_rate": 0.00014762139003746637,
+      "loss": 0.6554,
+      "step": 2035
+    },
+    {
+      "epoch": 0.36195555555555553,
+      "grad_norm": 0.37491201266683005,
+      "learning_rate": 0.0001475707499058605,
+      "loss": 0.663,
+      "step": 2036
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.3924362706200794,
+      "learning_rate": 0.00014752009400210067,
+      "loss": 0.6584,
+      "step": 2037
+    },
+    {
+      "epoch": 0.3623111111111111,
+      "grad_norm": 0.3662107952588624,
+      "learning_rate": 0.0001474694223429819,
+      "loss": 0.6446,
+      "step": 2038
+    },
+    {
+      "epoch": 0.3624888888888889,
+      "grad_norm": 0.3734678879172702,
+      "learning_rate": 0.0001474187349453045,
+      "loss": 0.6816,
+      "step": 2039
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.3551390433542849,
+      "learning_rate": 0.00014736803182587398,
+      "loss": 0.6803,
+      "step": 2040
+    },
+    {
+      "epoch": 0.36284444444444447,
+      "grad_norm": 0.3539108628447298,
+      "learning_rate": 0.0001473173130015009,
+      "loss": 0.6232,
+      "step": 2041
+    },
+    {
+      "epoch": 0.36302222222222225,
+      "grad_norm": 0.3497645277230926,
+      "learning_rate": 0.00014726657848900129,
+      "loss": 0.6324,
+      "step": 2042
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.3497048825422562,
+      "learning_rate": 0.00014721582830519623,
+      "loss": 0.6427,
+      "step": 2043
+    },
+    {
+      "epoch": 0.3633777777777778,
+      "grad_norm": 0.38204486391191084,
+      "learning_rate": 0.00014716506246691195,
+      "loss": 0.6234,
+      "step": 2044
+    },
+    {
+      "epoch": 0.3635555555555556,
+      "grad_norm": 0.353534799555291,
+      "learning_rate": 0.00014711428099098002,
+      "loss": 0.5836,
+      "step": 2045
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.41391749114138887,
+      "learning_rate": 0.00014706348389423708,
+      "loss": 0.6264,
+      "step": 2046
+    },
+    {
+      "epoch": 0.36391111111111113,
+      "grad_norm": 0.3756400198155271,
+      "learning_rate": 0.000147012671193525,
+      "loss": 0.6756,
+      "step": 2047
+    },
+    {
+      "epoch": 0.3640888888888889,
+      "grad_norm": 0.3847128408739994,
+      "learning_rate": 0.0001469618429056908,
+      "loss": 0.6295,
+      "step": 2048
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.3591691226806434,
+      "learning_rate": 0.00014691099904758667,
+      "loss": 0.633,
+      "step": 2049
+    },
+    {
+      "epoch": 0.36444444444444446,
+      "grad_norm": 0.36768580315706306,
+      "learning_rate": 0.00014686013963607,
+      "loss": 0.6466,
+      "step": 2050
+    },
+    {
+      "epoch": 0.36462222222222224,
+      "grad_norm": 0.3619506776953442,
+      "learning_rate": 0.00014680926468800326,
+      "loss": 0.6543,
+      "step": 2051
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.36832799311030334,
+      "learning_rate": 0.00014675837422025413,
+      "loss": 0.6624,
+      "step": 2052
+    },
+    {
+      "epoch": 0.3649777777777778,
+      "grad_norm": 0.38626330524947955,
+      "learning_rate": 0.00014670746824969544,
+      "loss": 0.6506,
+      "step": 2053
+    },
+    {
+      "epoch": 0.36515555555555557,
+      "grad_norm": 0.3846826512524197,
+      "learning_rate": 0.00014665654679320511,
+      "loss": 0.7167,
+      "step": 2054
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.36947120198843547,
+      "learning_rate": 0.00014660560986766623,
+      "loss": 0.6366,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3655111111111111,
+      "grad_norm": 0.3705785771649254,
+      "learning_rate": 0.00014655465748996703,
+      "loss": 0.6513,
+      "step": 2056
+    },
+    {
+      "epoch": 0.3656888888888889,
+      "grad_norm": 0.3737980056067438,
+      "learning_rate": 0.00014650368967700084,
+      "loss": 0.6602,
+      "step": 2057
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.3585112543555933,
+      "learning_rate": 0.00014645270644566617,
+      "loss": 0.6848,
+      "step": 2058
+    },
+    {
+      "epoch": 0.36604444444444445,
+      "grad_norm": 0.3636451331076331,
+      "learning_rate": 0.0001464017078128665,
+      "loss": 0.6626,
+      "step": 2059
+    },
+    {
+      "epoch": 0.3662222222222222,
+      "grad_norm": 0.3538149131040327,
+      "learning_rate": 0.00014635069379551055,
+      "loss": 0.6373,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.34539514311107683,
+      "learning_rate": 0.00014629966441051208,
+      "loss": 0.6186,
+      "step": 2061
+    },
+    {
+      "epoch": 0.3665777777777778,
+      "grad_norm": 0.36299689300073207,
+      "learning_rate": 0.00014624861967478997,
+      "loss": 0.6342,
+      "step": 2062
+    },
+    {
+      "epoch": 0.36675555555555556,
+      "grad_norm": 0.37152935857686487,
+      "learning_rate": 0.00014619755960526817,
+      "loss": 0.6555,
+      "step": 2063
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.3640051531385131,
+      "learning_rate": 0.00014614648421887574,
+      "loss": 0.6064,
+      "step": 2064
+    },
+    {
+      "epoch": 0.3671111111111111,
+      "grad_norm": 0.39013013963600696,
+      "learning_rate": 0.00014609539353254678,
+      "loss": 0.6103,
+      "step": 2065
+    },
+    {
+      "epoch": 0.3672888888888889,
+      "grad_norm": 0.3558377578251637,
+      "learning_rate": 0.00014604428756322048,
+      "loss": 0.6376,
+      "step": 2066
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.36082133986793363,
+      "learning_rate": 0.00014599316632784112,
+      "loss": 0.664,
+      "step": 2067
+    },
+    {
+      "epoch": 0.36764444444444444,
+      "grad_norm": 0.3744793863193506,
+      "learning_rate": 0.00014594202984335804,
+      "loss": 0.6245,
+      "step": 2068
+    },
+    {
+      "epoch": 0.3678222222222222,
+      "grad_norm": 0.34143396881227045,
+      "learning_rate": 0.00014589087812672558,
+      "loss": 0.6147,
+      "step": 2069
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.34854549704357407,
+      "learning_rate": 0.00014583971119490316,
+      "loss": 0.5693,
+      "step": 2070
+    },
+    {
+      "epoch": 0.36817777777777777,
+      "grad_norm": 0.3487247726779616,
+      "learning_rate": 0.00014578852906485531,
+      "loss": 0.6019,
+      "step": 2071
+    },
+    {
+      "epoch": 0.36835555555555555,
+      "grad_norm": 0.3627248460169325,
+      "learning_rate": 0.0001457373317535515,
+      "loss": 0.6591,
+      "step": 2072
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.35489116437132423,
+      "learning_rate": 0.0001456861192779663,
+      "loss": 0.6399,
+      "step": 2073
+    },
+    {
+      "epoch": 0.3687111111111111,
+      "grad_norm": 0.3898380138409434,
+      "learning_rate": 0.0001456348916550793,
+      "loss": 0.6699,
+      "step": 2074
+    },
+    {
+      "epoch": 0.3688888888888889,
+      "grad_norm": 0.34963253444716613,
+      "learning_rate": 0.00014558364890187501,
+      "loss": 0.6137,
+      "step": 2075
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.35621972227477494,
+      "learning_rate": 0.0001455323910353431,
+      "loss": 0.6873,
+      "step": 2076
+    },
+    {
+      "epoch": 0.36924444444444443,
+      "grad_norm": 0.3609739517191583,
+      "learning_rate": 0.0001454811180724782,
+      "loss": 0.639,
+      "step": 2077
+    },
+    {
+      "epoch": 0.3694222222222222,
+      "grad_norm": 0.35324307876701094,
+      "learning_rate": 0.00014542983003027995,
+      "loss": 0.6049,
+      "step": 2078
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3684218240588309,
+      "learning_rate": 0.00014537852692575294,
+      "loss": 0.6435,
+      "step": 2079
+    },
+    {
+      "epoch": 0.36977777777777776,
+      "grad_norm": 0.38440495160733995,
+      "learning_rate": 0.00014532720877590683,
+      "loss": 0.642,
+      "step": 2080
+    },
+    {
+      "epoch": 0.36995555555555554,
+      "grad_norm": 0.38862218740196974,
+      "learning_rate": 0.00014527587559775616,
+      "loss": 0.6687,
+      "step": 2081
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.3512888006933407,
+      "learning_rate": 0.00014522452740832063,
+      "loss": 0.6408,
+      "step": 2082
+    },
+    {
+      "epoch": 0.3703111111111111,
+      "grad_norm": 0.40489115076877097,
+      "learning_rate": 0.0001451731642246247,
+      "loss": 0.636,
+      "step": 2083
+    },
+    {
+      "epoch": 0.37048888888888887,
+      "grad_norm": 0.3642348665286737,
+      "learning_rate": 0.000145121786063698,
+      "loss": 0.6552,
+      "step": 2084
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.36209778743433185,
+      "learning_rate": 0.00014507039294257498,
+      "loss": 0.593,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3708444444444444,
+      "grad_norm": 0.3548057008262031,
+      "learning_rate": 0.00014501898487829514,
+      "loss": 0.5857,
+      "step": 2086
+    },
+    {
+      "epoch": 0.3710222222222222,
+      "grad_norm": 0.35657636965959416,
+      "learning_rate": 0.0001449675618879029,
+      "loss": 0.6355,
+      "step": 2087
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3584592484074798,
+      "learning_rate": 0.0001449161239884476,
+      "loss": 0.6573,
+      "step": 2088
+    },
+    {
+      "epoch": 0.37137777777777775,
+      "grad_norm": 0.3585384982314153,
+      "learning_rate": 0.00014486467119698357,
+      "loss": 0.6394,
+      "step": 2089
+    },
+    {
+      "epoch": 0.37155555555555553,
+      "grad_norm": 0.39821068958150124,
+      "learning_rate": 0.00014481320353057007,
+      "loss": 0.6313,
+      "step": 2090
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.3659565415690751,
+      "learning_rate": 0.00014476172100627127,
+      "loss": 0.6097,
+      "step": 2091
+    },
+    {
+      "epoch": 0.37191111111111114,
+      "grad_norm": 0.4015216229336997,
+      "learning_rate": 0.00014471022364115628,
+      "loss": 0.6373,
+      "step": 2092
+    },
+    {
+      "epoch": 0.3720888888888889,
+      "grad_norm": 0.3999333247518061,
+      "learning_rate": 0.00014465871145229913,
+      "loss": 0.6955,
+      "step": 2093
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.3992134508434656,
+      "learning_rate": 0.00014460718445677876,
+      "loss": 0.6792,
+      "step": 2094
+    },
+    {
+      "epoch": 0.37244444444444447,
+      "grad_norm": 0.38087493710745773,
+      "learning_rate": 0.00014455564267167905,
+      "loss": 0.6094,
+      "step": 2095
+    },
+    {
+      "epoch": 0.37262222222222224,
+      "grad_norm": 0.35604088117839133,
+      "learning_rate": 0.00014450408611408873,
+      "loss": 0.5776,
+      "step": 2096
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3672131924064539,
+      "learning_rate": 0.00014445251480110145,
+      "loss": 0.6622,
+      "step": 2097
+    },
+    {
+      "epoch": 0.3729777777777778,
+      "grad_norm": 0.35990351493537415,
+      "learning_rate": 0.00014440092874981576,
+      "loss": 0.6673,
+      "step": 2098
+    },
+    {
+      "epoch": 0.3731555555555556,
+      "grad_norm": 0.364851988251713,
+      "learning_rate": 0.00014434932797733515,
+      "loss": 0.6446,
+      "step": 2099
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.3695891243315968,
+      "learning_rate": 0.00014429771250076785,
+      "loss": 0.6858,
+      "step": 2100
+    },
+    {
+      "epoch": 0.3735111111111111,
+      "grad_norm": 0.36310420904474155,
+      "learning_rate": 0.00014424608233722707,
+      "loss": 0.6583,
+      "step": 2101
+    },
+    {
+      "epoch": 0.3736888888888889,
+      "grad_norm": 0.3532427224281201,
+      "learning_rate": 0.0001441944375038309,
+      "loss": 0.6244,
+      "step": 2102
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.3641283155611553,
+      "learning_rate": 0.00014414277801770223,
+      "loss": 0.6413,
+      "step": 2103
+    },
+    {
+      "epoch": 0.37404444444444446,
+      "grad_norm": 0.37282122080784824,
+      "learning_rate": 0.00014409110389596887,
+      "loss": 0.6587,
+      "step": 2104
+    },
+    {
+      "epoch": 0.37422222222222223,
+      "grad_norm": 0.3618878863527839,
+      "learning_rate": 0.00014403941515576344,
+      "loss": 0.6073,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3629842550856872,
+      "learning_rate": 0.0001439877118142234,
+      "loss": 0.6559,
+      "step": 2106
+    },
+    {
+      "epoch": 0.3745777777777778,
+      "grad_norm": 0.37984758502006777,
+      "learning_rate": 0.0001439359938884911,
+      "loss": 0.6562,
+      "step": 2107
+    },
+    {
+      "epoch": 0.37475555555555556,
+      "grad_norm": 0.3602592501165416,
+      "learning_rate": 0.0001438842613957137,
+      "loss": 0.5927,
+      "step": 2108
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.3649379205527932,
+      "learning_rate": 0.00014383251435304314,
+      "loss": 0.5901,
+      "step": 2109
+    },
+    {
+      "epoch": 0.3751111111111111,
+      "grad_norm": 0.3736945232348994,
+      "learning_rate": 0.0001437807527776363,
+      "loss": 0.6318,
+      "step": 2110
+    },
+    {
+      "epoch": 0.3752888888888889,
+      "grad_norm": 0.3726405137609307,
+      "learning_rate": 0.00014372897668665476,
+      "loss": 0.6436,
+      "step": 2111
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.35773199192047844,
+      "learning_rate": 0.00014367718609726497,
+      "loss": 0.6119,
+      "step": 2112
+    },
+    {
+      "epoch": 0.37564444444444445,
+      "grad_norm": 0.3958807454635296,
+      "learning_rate": 0.00014362538102663817,
+      "loss": 0.6479,
+      "step": 2113
+    },
+    {
+      "epoch": 0.3758222222222222,
+      "grad_norm": 0.3717928204583553,
+      "learning_rate": 0.00014357356149195043,
+      "loss": 0.6561,
+      "step": 2114
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.37259846302759836,
+      "learning_rate": 0.00014352172751038258,
+      "loss": 0.6284,
+      "step": 2115
+    },
+    {
+      "epoch": 0.3761777777777778,
+      "grad_norm": 0.3803169871031899,
+      "learning_rate": 0.00014346987909912023,
+      "loss": 0.6457,
+      "step": 2116
+    },
+    {
+      "epoch": 0.37635555555555555,
+      "grad_norm": 0.341157191367804,
+      "learning_rate": 0.00014341801627535387,
+      "loss": 0.5963,
+      "step": 2117
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.37635781406654667,
+      "learning_rate": 0.00014336613905627864,
+      "loss": 0.6881,
+      "step": 2118
+    },
+    {
+      "epoch": 0.3767111111111111,
+      "grad_norm": 0.3494390544002213,
+      "learning_rate": 0.00014331424745909455,
+      "loss": 0.6236,
+      "step": 2119
+    },
+    {
+      "epoch": 0.3768888888888889,
+      "grad_norm": 0.3711944336613304,
+      "learning_rate": 0.00014326234150100628,
+      "loss": 0.6847,
+      "step": 2120
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.3639506200310751,
+      "learning_rate": 0.00014321042119922337,
+      "loss": 0.5967,
+      "step": 2121
+    },
+    {
+      "epoch": 0.37724444444444444,
+      "grad_norm": 0.36612709961017464,
+      "learning_rate": 0.00014315848657096004,
+      "loss": 0.6628,
+      "step": 2122
+    },
+    {
+      "epoch": 0.3774222222222222,
+      "grad_norm": 0.37448169012243826,
+      "learning_rate": 0.00014310653763343538,
+      "loss": 0.6286,
+      "step": 2123
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3529198758310332,
+      "learning_rate": 0.00014305457440387306,
+      "loss": 0.5969,
+      "step": 2124
+    },
+    {
+      "epoch": 0.37777777777777777,
+      "grad_norm": 0.3897453826275033,
+      "learning_rate": 0.00014300259689950157,
+      "loss": 0.6221,
+      "step": 2125
+    },
+    {
+      "epoch": 0.37795555555555554,
+      "grad_norm": 0.3600597342924437,
+      "learning_rate": 0.00014295060513755417,
+      "loss": 0.6403,
+      "step": 2126
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.347381650346313,
+      "learning_rate": 0.00014289859913526874,
+      "loss": 0.6284,
+      "step": 2127
+    },
+    {
+      "epoch": 0.3783111111111111,
+      "grad_norm": 0.35067161675773445,
+      "learning_rate": 0.000142846578909888,
+      "loss": 0.6026,
+      "step": 2128
+    },
+    {
+      "epoch": 0.3784888888888889,
+      "grad_norm": 0.352117349469237,
+      "learning_rate": 0.00014279454447865936,
+      "loss": 0.6625,
+      "step": 2129
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.38615121863091334,
+      "learning_rate": 0.00014274249585883488,
+      "loss": 0.6915,
+      "step": 2130
+    },
+    {
+      "epoch": 0.37884444444444443,
+      "grad_norm": 0.34811074245483276,
+      "learning_rate": 0.00014269043306767135,
+      "loss": 0.5989,
+      "step": 2131
+    },
+    {
+      "epoch": 0.3790222222222222,
+      "grad_norm": 0.3761481727509228,
+      "learning_rate": 0.00014263835612243026,
+      "loss": 0.652,
+      "step": 2132
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.33384252861099756,
+      "learning_rate": 0.00014258626504037785,
+      "loss": 0.5909,
+      "step": 2133
+    },
+    {
+      "epoch": 0.37937777777777776,
+      "grad_norm": 0.35198272084894605,
+      "learning_rate": 0.00014253415983878494,
+      "loss": 0.6221,
+      "step": 2134
+    },
+    {
+      "epoch": 0.37955555555555553,
+      "grad_norm": 0.36860438851002236,
+      "learning_rate": 0.0001424820405349271,
+      "loss": 0.6593,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.3647099858235695,
+      "learning_rate": 0.0001424299071460846,
+      "loss": 0.6084,
+      "step": 2136
+    },
+    {
+      "epoch": 0.3799111111111111,
+      "grad_norm": 0.36971936067843375,
+      "learning_rate": 0.00014237775968954232,
+      "loss": 0.5996,
+      "step": 2137
+    },
+    {
+      "epoch": 0.38008888888888887,
+      "grad_norm": 0.40135103166774755,
+      "learning_rate": 0.00014232559818258984,
+      "loss": 0.6435,
+      "step": 2138
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.36258104916378414,
+      "learning_rate": 0.00014227342264252135,
+      "loss": 0.5921,
+      "step": 2139
+    },
+    {
+      "epoch": 0.3804444444444444,
+      "grad_norm": 0.3462943529021952,
+      "learning_rate": 0.00014222123308663576,
+      "loss": 0.6274,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3806222222222222,
+      "grad_norm": 0.3705258541393284,
+      "learning_rate": 0.00014216902953223656,
+      "loss": 0.6067,
+      "step": 2141
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.34363773144042226,
+      "learning_rate": 0.00014211681199663198,
+      "loss": 0.6345,
+      "step": 2142
+    },
+    {
+      "epoch": 0.3809777777777778,
+      "grad_norm": 0.36093738100974226,
+      "learning_rate": 0.00014206458049713478,
+      "loss": 0.5837,
+      "step": 2143
+    },
+    {
+      "epoch": 0.3811555555555556,
+      "grad_norm": 0.3410809396233212,
+      "learning_rate": 0.0001420123350510624,
+      "loss": 0.6162,
+      "step": 2144
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.37523960210836604,
+      "learning_rate": 0.0001419600756757369,
+      "loss": 0.6835,
+      "step": 2145
+    },
+    {
+      "epoch": 0.38151111111111113,
+      "grad_norm": 0.34753360577189735,
+      "learning_rate": 0.00014190780238848493,
+      "loss": 0.6001,
+      "step": 2146
+    },
+    {
+      "epoch": 0.3816888888888889,
+      "grad_norm": 0.3417390143121445,
+      "learning_rate": 0.00014185551520663783,
+      "loss": 0.6318,
+      "step": 2147
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.34767287256511925,
+      "learning_rate": 0.0001418032141475315,
+      "loss": 0.6672,
+      "step": 2148
+    },
+    {
+      "epoch": 0.38204444444444446,
+      "grad_norm": 0.34986866939572314,
+      "learning_rate": 0.00014175089922850633,
+      "loss": 0.6049,
+      "step": 2149
+    },
+    {
+      "epoch": 0.38222222222222224,
+      "grad_norm": 0.38346100398983146,
+      "learning_rate": 0.00014169857046690752,
+      "loss": 0.5949,
+      "step": 2150
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3604941478020668,
+      "learning_rate": 0.0001416462278800847,
+      "loss": 0.6329,
+      "step": 2151
+    },
+    {
+      "epoch": 0.3825777777777778,
+      "grad_norm": 0.37944882343357084,
+      "learning_rate": 0.00014159387148539212,
+      "loss": 0.659,
+      "step": 2152
+    },
+    {
+      "epoch": 0.38275555555555557,
+      "grad_norm": 0.3550831565518237,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.6648,
+      "step": 2153
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.3730216991981197,
+      "learning_rate": 0.00014148911734183773,
+      "loss": 0.7023,
+      "step": 2154
+    },
+    {
+      "epoch": 0.3831111111111111,
+      "grad_norm": 0.3782374506276671,
+      "learning_rate": 0.00014143671962770727,
+      "loss": 0.65,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3832888888888889,
+      "grad_norm": 0.42267071871104744,
+      "learning_rate": 0.00014138430817516989,
+      "loss": 0.6634,
+      "step": 2156
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.34093793843641973,
+      "learning_rate": 0.0001413318830016026,
+      "loss": 0.6557,
+      "step": 2157
+    },
+    {
+      "epoch": 0.38364444444444445,
+      "grad_norm": 0.3507739779657896,
+      "learning_rate": 0.00014127944412438713,
+      "loss": 0.6137,
+      "step": 2158
+    },
+    {
+      "epoch": 0.38382222222222223,
+      "grad_norm": 0.3466109679160134,
+      "learning_rate": 0.00014122699156090963,
+      "loss": 0.6066,
+      "step": 2159
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.41556009016731515,
+      "learning_rate": 0.00014117452532856083,
+      "loss": 0.5822,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3841777777777778,
+      "grad_norm": 0.4202343085679337,
+      "learning_rate": 0.00014112204544473598,
+      "loss": 0.6453,
+      "step": 2161
+    },
+    {
+      "epoch": 0.38435555555555556,
+      "grad_norm": 0.3577389123139154,
+      "learning_rate": 0.00014106955192683487,
+      "loss": 0.6047,
+      "step": 2162
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.374706724519363,
+      "learning_rate": 0.00014101704479226181,
+      "loss": 0.6094,
+      "step": 2163
+    },
+    {
+      "epoch": 0.3847111111111111,
+      "grad_norm": 0.36322802240061863,
+      "learning_rate": 0.0001409645240584256,
+      "loss": 0.5792,
+      "step": 2164
+    },
+    {
+      "epoch": 0.3848888888888889,
+      "grad_norm": 0.3462895298219409,
+      "learning_rate": 0.0001409119897427396,
+      "loss": 0.6522,
+      "step": 2165
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.38805561136927863,
+      "learning_rate": 0.00014085944186262162,
+      "loss": 0.6543,
+      "step": 2166
+    },
+    {
+      "epoch": 0.38524444444444444,
+      "grad_norm": 0.3649822424055061,
+      "learning_rate": 0.00014080688043549398,
+      "loss": 0.6085,
+      "step": 2167
+    },
+    {
+      "epoch": 0.3854222222222222,
+      "grad_norm": 0.35747983341457484,
+      "learning_rate": 0.00014075430547878353,
+      "loss": 0.6194,
+      "step": 2168
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3754182539842769,
+      "learning_rate": 0.0001407017170099216,
+      "loss": 0.6439,
+      "step": 2169
+    },
+    {
+      "epoch": 0.3857777777777778,
+      "grad_norm": 0.3489245454930453,
+      "learning_rate": 0.00014064911504634389,
+      "loss": 0.6373,
+      "step": 2170
+    },
+    {
+      "epoch": 0.38595555555555555,
+      "grad_norm": 0.356052339556721,
+      "learning_rate": 0.0001405964996054907,
+      "loss": 0.6595,
+      "step": 2171
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.382998213017678,
+      "learning_rate": 0.00014054387070480678,
+      "loss": 0.7346,
+      "step": 2172
+    },
+    {
+      "epoch": 0.3863111111111111,
+      "grad_norm": 0.3580304639049787,
+      "learning_rate": 0.00014049122836174135,
+      "loss": 0.6117,
+      "step": 2173
+    },
+    {
+      "epoch": 0.3864888888888889,
+      "grad_norm": 0.36935968443308337,
+      "learning_rate": 0.000140438572593748,
+      "loss": 0.6377,
+      "step": 2174
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.36965018729194526,
+      "learning_rate": 0.00014038590341828485,
+      "loss": 0.6696,
+      "step": 2175
+    },
+    {
+      "epoch": 0.38684444444444444,
+      "grad_norm": 0.36224405451966574,
+      "learning_rate": 0.0001403332208528144,
+      "loss": 0.5822,
+      "step": 2176
+    },
+    {
+      "epoch": 0.3870222222222222,
+      "grad_norm": 0.43704488788052576,
+      "learning_rate": 0.0001402805249148037,
+      "loss": 0.6424,
+      "step": 2177
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.36465651144729866,
+      "learning_rate": 0.00014022781562172417,
+      "loss": 0.6054,
+      "step": 2178
+    },
+    {
+      "epoch": 0.38737777777777777,
+      "grad_norm": 0.39460126513394794,
+      "learning_rate": 0.00014017509299105158,
+      "loss": 0.6311,
+      "step": 2179
+    },
+    {
+      "epoch": 0.38755555555555554,
+      "grad_norm": 0.34746145220952596,
+      "learning_rate": 0.0001401223570402663,
+      "loss": 0.6356,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.3502672643898457,
+      "learning_rate": 0.0001400696077868529,
+      "loss": 0.5963,
+      "step": 2181
+    },
+    {
+      "epoch": 0.3879111111111111,
+      "grad_norm": 0.34879019387944665,
+      "learning_rate": 0.00014001684524830057,
+      "loss": 0.6281,
+      "step": 2182
+    },
+    {
+      "epoch": 0.38808888888888887,
+      "grad_norm": 0.3694671698202079,
+      "learning_rate": 0.00013996406944210277,
+      "loss": 0.6254,
+      "step": 2183
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.36143632327219255,
+      "learning_rate": 0.00013991128038575741,
+      "loss": 0.6224,
+      "step": 2184
+    },
+    {
+      "epoch": 0.3884444444444444,
+      "grad_norm": 0.35475970790390776,
+      "learning_rate": 0.0001398584780967668,
+      "loss": 0.6218,
+      "step": 2185
+    },
+    {
+      "epoch": 0.3886222222222222,
+      "grad_norm": 0.35113464373309267,
+      "learning_rate": 0.00013980566259263756,
+      "loss": 0.6364,
+      "step": 2186
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.38080474460712316,
+      "learning_rate": 0.00013975283389088079,
+      "loss": 0.6519,
+      "step": 2187
+    },
+    {
+      "epoch": 0.38897777777777776,
+      "grad_norm": 0.3387932822864206,
+      "learning_rate": 0.00013969999200901193,
+      "loss": 0.5573,
+      "step": 2188
+    },
+    {
+      "epoch": 0.38915555555555553,
+      "grad_norm": 0.36755095589103015,
+      "learning_rate": 0.00013964713696455074,
+      "loss": 0.6025,
+      "step": 2189
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.37931402832404987,
+      "learning_rate": 0.00013959426877502144,
+      "loss": 0.6475,
+      "step": 2190
+    },
+    {
+      "epoch": 0.3895111111111111,
+      "grad_norm": 0.3662383744919383,
+      "learning_rate": 0.00013954138745795257,
+      "loss": 0.6702,
+      "step": 2191
+    },
+    {
+      "epoch": 0.38968888888888886,
+      "grad_norm": 0.35190299380242607,
+      "learning_rate": 0.00013948849303087698,
+      "loss": 0.6205,
+      "step": 2192
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.36172112011819413,
+      "learning_rate": 0.00013943558551133186,
+      "loss": 0.6431,
+      "step": 2193
+    },
+    {
+      "epoch": 0.39004444444444447,
+      "grad_norm": 0.35138678831696485,
+      "learning_rate": 0.00013938266491685886,
+      "loss": 0.6374,
+      "step": 2194
+    },
+    {
+      "epoch": 0.39022222222222225,
+      "grad_norm": 0.36519395240501246,
+      "learning_rate": 0.0001393297312650038,
+      "loss": 0.6495,
+      "step": 2195
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.521492896553643,
+      "learning_rate": 0.00013927678457331699,
+      "loss": 0.6226,
+      "step": 2196
+    },
+    {
+      "epoch": 0.3905777777777778,
+      "grad_norm": 0.3548497023119579,
+      "learning_rate": 0.00013922382485935297,
+      "loss": 0.6051,
+      "step": 2197
+    },
+    {
+      "epoch": 0.3907555555555556,
+      "grad_norm": 0.38448313075046897,
+      "learning_rate": 0.00013917085214067054,
+      "loss": 0.7214,
+      "step": 2198
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.3687069401932563,
+      "learning_rate": 0.00013911786643483297,
+      "loss": 0.6517,
+      "step": 2199
+    },
+    {
+      "epoch": 0.39111111111111113,
+      "grad_norm": 0.34684049663403577,
+      "learning_rate": 0.0001390648677594077,
+      "loss": 0.6053,
+      "step": 2200
+    },
+    {
+      "epoch": 0.3912888888888889,
+      "grad_norm": 0.35180260381139583,
+      "learning_rate": 0.00013901185613196654,
+      "loss": 0.6239,
+      "step": 2201
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.3698391599021474,
+      "learning_rate": 0.00013895883157008558,
+      "loss": 0.6064,
+      "step": 2202
+    },
+    {
+      "epoch": 0.39164444444444446,
+      "grad_norm": 0.36472959917824455,
+      "learning_rate": 0.00013890579409134518,
+      "loss": 0.6287,
+      "step": 2203
+    },
+    {
+      "epoch": 0.39182222222222224,
+      "grad_norm": 0.38474238140006123,
+      "learning_rate": 0.00013885274371333,
+      "loss": 0.6826,
+      "step": 2204
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.3476018592702132,
+      "learning_rate": 0.00013879968045362901,
+      "loss": 0.6537,
+      "step": 2205
+    },
+    {
+      "epoch": 0.3921777777777778,
+      "grad_norm": 0.38442642410466105,
+      "learning_rate": 0.00013874660432983536,
+      "loss": 0.7407,
+      "step": 2206
+    },
+    {
+      "epoch": 0.39235555555555557,
+      "grad_norm": 0.3620908563106699,
+      "learning_rate": 0.00013869351535954652,
+      "loss": 0.61,
+      "step": 2207
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.36323918214435275,
+      "learning_rate": 0.00013864041356036427,
+      "loss": 0.6035,
+      "step": 2208
+    },
+    {
+      "epoch": 0.3927111111111111,
+      "grad_norm": 0.3465735449833905,
+      "learning_rate": 0.00013858729894989456,
+      "loss": 0.6079,
+      "step": 2209
+    },
+    {
+      "epoch": 0.3928888888888889,
+      "grad_norm": 0.3508195670082517,
+      "learning_rate": 0.0001385341715457476,
+      "loss": 0.6149,
+      "step": 2210
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.3649447656788205,
+      "learning_rate": 0.00013848103136553788,
+      "loss": 0.6607,
+      "step": 2211
+    },
+    {
+      "epoch": 0.39324444444444445,
+      "grad_norm": 0.32540805810766693,
+      "learning_rate": 0.00013842787842688412,
+      "loss": 0.5757,
+      "step": 2212
+    },
+    {
+      "epoch": 0.39342222222222223,
+      "grad_norm": 0.37045526111195654,
+      "learning_rate": 0.00013837471274740924,
+      "loss": 0.6543,
+      "step": 2213
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3878579193509301,
+      "learning_rate": 0.0001383215343447404,
+      "loss": 0.6754,
+      "step": 2214
+    },
+    {
+      "epoch": 0.3937777777777778,
+      "grad_norm": 0.3667130246029731,
+      "learning_rate": 0.000138268343236509,
+      "loss": 0.6314,
+      "step": 2215
+    },
+    {
+      "epoch": 0.39395555555555556,
+      "grad_norm": 0.35799672474178074,
+      "learning_rate": 0.0001382151394403506,
+      "loss": 0.6496,
+      "step": 2216
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.41999942376969673,
+      "learning_rate": 0.00013816192297390502,
+      "loss": 0.6045,
+      "step": 2217
+    },
+    {
+      "epoch": 0.3943111111111111,
+      "grad_norm": 0.3994431684518544,
+      "learning_rate": 0.00013810869385481623,
+      "loss": 0.7035,
+      "step": 2218
+    },
+    {
+      "epoch": 0.3944888888888889,
+      "grad_norm": 0.3711501205407042,
+      "learning_rate": 0.0001380554521007325,
+      "loss": 0.6001,
+      "step": 2219
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.3453335682249666,
+      "learning_rate": 0.00013800219772930612,
+      "loss": 0.5978,
+      "step": 2220
+    },
+    {
+      "epoch": 0.39484444444444444,
+      "grad_norm": 0.3660366715776582,
+      "learning_rate": 0.00013794893075819373,
+      "loss": 0.6391,
+      "step": 2221
+    },
+    {
+      "epoch": 0.3950222222222222,
+      "grad_norm": 0.5015331879628174,
+      "learning_rate": 0.00013789565120505607,
+      "loss": 0.7131,
+      "step": 2222
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.35333002194112134,
+      "learning_rate": 0.000137842359087558,
+      "loss": 0.641,
+      "step": 2223
+    },
+    {
+      "epoch": 0.3953777777777778,
+      "grad_norm": 0.3564476048752636,
+      "learning_rate": 0.00013778905442336865,
+      "loss": 0.6278,
+      "step": 2224
+    },
+    {
+      "epoch": 0.39555555555555555,
+      "grad_norm": 0.35276632598806756,
+      "learning_rate": 0.00013773573723016122,
+      "loss": 0.6133,
+      "step": 2225
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.37797619461352133,
+      "learning_rate": 0.00013768240752561314,
+      "loss": 0.6732,
+      "step": 2226
+    },
+    {
+      "epoch": 0.3959111111111111,
+      "grad_norm": 0.3783009340377197,
+      "learning_rate": 0.00013762906532740595,
+      "loss": 0.6743,
+      "step": 2227
+    },
+    {
+      "epoch": 0.3960888888888889,
+      "grad_norm": 0.3765372253957615,
+      "learning_rate": 0.00013757571065322534,
+      "loss": 0.6255,
+      "step": 2228
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.35524234346994,
+      "learning_rate": 0.00013752234352076116,
+      "loss": 0.6157,
+      "step": 2229
+    },
+    {
+      "epoch": 0.39644444444444443,
+      "grad_norm": 0.345037872312445,
+      "learning_rate": 0.00013746896394770727,
+      "loss": 0.5931,
+      "step": 2230
+    },
+    {
+      "epoch": 0.3966222222222222,
+      "grad_norm": 0.36783560622610667,
+      "learning_rate": 0.00013741557195176183,
+      "loss": 0.6063,
+      "step": 2231
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3729745411551487,
+      "learning_rate": 0.000137362167550627,
+      "loss": 0.6541,
+      "step": 2232
+    },
+    {
+      "epoch": 0.39697777777777776,
+      "grad_norm": 0.3654039684684545,
+      "learning_rate": 0.00013730875076200914,
+      "loss": 0.6036,
+      "step": 2233
+    },
+    {
+      "epoch": 0.39715555555555554,
+      "grad_norm": 0.35109694080636267,
+      "learning_rate": 0.00013725532160361863,
+      "loss": 0.6297,
+      "step": 2234
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.36470389923261654,
+      "learning_rate": 0.00013720188009316996,
+      "loss": 0.6295,
+      "step": 2235
+    },
+    {
+      "epoch": 0.3975111111111111,
+      "grad_norm": 0.37171084612308297,
+      "learning_rate": 0.00013714842624838177,
+      "loss": 0.6561,
+      "step": 2236
+    },
+    {
+      "epoch": 0.39768888888888887,
+      "grad_norm": 0.41671806828699953,
+      "learning_rate": 0.0001370949600869768,
+      "loss": 0.6518,
+      "step": 2237
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.3744424006821874,
+      "learning_rate": 0.00013704148162668178,
+      "loss": 0.5946,
+      "step": 2238
+    },
+    {
+      "epoch": 0.3980444444444444,
+      "grad_norm": 0.3999851178907819,
+      "learning_rate": 0.00013698799088522758,
+      "loss": 0.6594,
+      "step": 2239
+    },
+    {
+      "epoch": 0.3982222222222222,
+      "grad_norm": 0.35331865769428233,
+      "learning_rate": 0.00013693448788034917,
+      "loss": 0.5764,
+      "step": 2240
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.38066216369092526,
+      "learning_rate": 0.00013688097262978555,
+      "loss": 0.6521,
+      "step": 2241
+    },
+    {
+      "epoch": 0.39857777777777775,
+      "grad_norm": 0.3572870438883799,
+      "learning_rate": 0.00013682744515127975,
+      "loss": 0.6054,
+      "step": 2242
+    },
+    {
+      "epoch": 0.39875555555555553,
+      "grad_norm": 0.35907742570038703,
+      "learning_rate": 0.0001367739054625789,
+      "loss": 0.6397,
+      "step": 2243
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.41714292789546137,
+      "learning_rate": 0.00013672035358143418,
+      "loss": 0.6565,
+      "step": 2244
+    },
+    {
+      "epoch": 0.39911111111111114,
+      "grad_norm": 0.405625896759291,
+      "learning_rate": 0.00013666678952560076,
+      "loss": 0.6559,
+      "step": 2245
+    },
+    {
+      "epoch": 0.3992888888888889,
+      "grad_norm": 0.39412618111010667,
+      "learning_rate": 0.00013661321331283796,
+      "loss": 0.653,
+      "step": 2246
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.37103779656489266,
+      "learning_rate": 0.00013655962496090894,
+      "loss": 0.6572,
+      "step": 2247
+    },
+    {
+      "epoch": 0.39964444444444447,
+      "grad_norm": 0.37300796764286714,
+      "learning_rate": 0.00013650602448758112,
+      "loss": 0.6237,
+      "step": 2248
+    },
+    {
+      "epoch": 0.39982222222222225,
+      "grad_norm": 0.37027085419069994,
+      "learning_rate": 0.0001364524119106257,
+      "loss": 0.6453,
+      "step": 2249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3763403156025713,
+      "learning_rate": 0.00013639878724781813,
+      "loss": 0.5964,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4001777777777778,
+      "grad_norm": 0.3637796464262999,
+      "learning_rate": 0.00013634515051693766,
+      "loss": 0.6352,
+      "step": 2251
+    },
+    {
+      "epoch": 0.4003555555555556,
+      "grad_norm": 0.3520726300177121,
+      "learning_rate": 0.00013629150173576762,
+      "loss": 0.5829,
+      "step": 2252
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.36119202828985025,
+      "learning_rate": 0.00013623784092209543,
+      "loss": 0.6425,
+      "step": 2253
+    },
+    {
+      "epoch": 0.40071111111111113,
+      "grad_norm": 0.37049771885488864,
+      "learning_rate": 0.00013618416809371237,
+      "loss": 0.6921,
+      "step": 2254
+    },
+    {
+      "epoch": 0.4008888888888889,
+      "grad_norm": 0.37393697448475316,
+      "learning_rate": 0.00013613048326841372,
+      "loss": 0.6195,
+      "step": 2255
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.34601021032042534,
+      "learning_rate": 0.0001360767864639988,
+      "loss": 0.6378,
+      "step": 2256
+    },
+    {
+      "epoch": 0.40124444444444446,
+      "grad_norm": 0.3408026567985287,
+      "learning_rate": 0.00013602307769827084,
+      "loss": 0.6105,
+      "step": 2257
+    },
+    {
+      "epoch": 0.40142222222222224,
+      "grad_norm": 0.36604065559847915,
+      "learning_rate": 0.0001359693569890371,
+      "loss": 0.6153,
+      "step": 2258
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.39299322092358724,
+      "learning_rate": 0.0001359156243541087,
+      "loss": 0.6807,
+      "step": 2259
+    },
+    {
+      "epoch": 0.4017777777777778,
+      "grad_norm": 0.3560902742072705,
+      "learning_rate": 0.00013586187981130086,
+      "loss": 0.6425,
+      "step": 2260
+    },
+    {
+      "epoch": 0.40195555555555557,
+      "grad_norm": 0.39317614378135457,
+      "learning_rate": 0.00013580812337843262,
+      "loss": 0.6101,
+      "step": 2261
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.3843783502794162,
+      "learning_rate": 0.00013575435507332697,
+      "loss": 0.6108,
+      "step": 2262
+    },
+    {
+      "epoch": 0.4023111111111111,
+      "grad_norm": 0.3688348767071207,
+      "learning_rate": 0.0001357005749138109,
+      "loss": 0.6007,
+      "step": 2263
+    },
+    {
+      "epoch": 0.4024888888888889,
+      "grad_norm": 0.3680543503224331,
+      "learning_rate": 0.00013564678291771534,
+      "loss": 0.6601,
+      "step": 2264
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.37164803022363313,
+      "learning_rate": 0.00013559297910287508,
+      "loss": 0.6081,
+      "step": 2265
+    },
+    {
+      "epoch": 0.40284444444444445,
+      "grad_norm": 0.39621972978569014,
+      "learning_rate": 0.00013553916348712884,
+      "loss": 0.6391,
+      "step": 2266
+    },
+    {
+      "epoch": 0.4030222222222222,
+      "grad_norm": 0.3689725631616896,
+      "learning_rate": 0.0001354853360883193,
+      "loss": 0.6147,
+      "step": 2267
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.36287847111403354,
+      "learning_rate": 0.000135431496924293,
+      "loss": 0.5916,
+      "step": 2268
+    },
+    {
+      "epoch": 0.4033777777777778,
+      "grad_norm": 0.3658611910294385,
+      "learning_rate": 0.00013537764601290037,
+      "loss": 0.636,
+      "step": 2269
+    },
+    {
+      "epoch": 0.40355555555555556,
+      "grad_norm": 0.35927960370283263,
+      "learning_rate": 0.00013532378337199582,
+      "loss": 0.6168,
+      "step": 2270
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.3471200168272243,
+      "learning_rate": 0.00013526990901943756,
+      "loss": 0.6227,
+      "step": 2271
+    },
+    {
+      "epoch": 0.4039111111111111,
+      "grad_norm": 0.36451245300985974,
+      "learning_rate": 0.0001352160229730877,
+      "loss": 0.6878,
+      "step": 2272
+    },
+    {
+      "epoch": 0.4040888888888889,
+      "grad_norm": 0.36247825343298284,
+      "learning_rate": 0.00013516212525081222,
+      "loss": 0.6112,
+      "step": 2273
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.33912327267062775,
+      "learning_rate": 0.00013510821587048107,
+      "loss": 0.5718,
+      "step": 2274
+    },
+    {
+      "epoch": 0.40444444444444444,
+      "grad_norm": 0.3534102851106006,
+      "learning_rate": 0.00013505429484996788,
+      "loss": 0.6049,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4046222222222222,
+      "grad_norm": 0.36210789811782823,
+      "learning_rate": 0.00013500036220715034,
+      "loss": 0.6086,
+      "step": 2276
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3610783069549878,
+      "learning_rate": 0.00013494641795990986,
+      "loss": 0.6525,
+      "step": 2277
+    },
+    {
+      "epoch": 0.40497777777777777,
+      "grad_norm": 0.3456435218318268,
+      "learning_rate": 0.00013489246212613172,
+      "loss": 0.6235,
+      "step": 2278
+    },
+    {
+      "epoch": 0.40515555555555555,
+      "grad_norm": 0.3491123928818089,
+      "learning_rate": 0.0001348384947237051,
+      "loss": 0.6133,
+      "step": 2279
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.4347292146868507,
+      "learning_rate": 0.00013478451577052293,
+      "loss": 0.6013,
+      "step": 2280
+    },
+    {
+      "epoch": 0.4055111111111111,
+      "grad_norm": 0.3521540151615664,
+      "learning_rate": 0.00013473052528448201,
+      "loss": 0.6306,
+      "step": 2281
+    },
+    {
+      "epoch": 0.4056888888888889,
+      "grad_norm": 0.3657452480559006,
+      "learning_rate": 0.00013467652328348306,
+      "loss": 0.6575,
+      "step": 2282
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.377056831347542,
+      "learning_rate": 0.00013462250978543044,
+      "loss": 0.6526,
+      "step": 2283
+    },
+    {
+      "epoch": 0.40604444444444443,
+      "grad_norm": 0.35899665506459566,
+      "learning_rate": 0.00013456848480823238,
+      "loss": 0.6077,
+      "step": 2284
+    },
+    {
+      "epoch": 0.4062222222222222,
+      "grad_norm": 0.3489345735334524,
+      "learning_rate": 0.000134514448369801,
+      "loss": 0.6295,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.386870182272734,
+      "learning_rate": 0.00013446040048805218,
+      "loss": 0.6722,
+      "step": 2286
+    },
+    {
+      "epoch": 0.40657777777777776,
+      "grad_norm": 0.3520920670584741,
+      "learning_rate": 0.00013440634118090552,
+      "loss": 0.594,
+      "step": 2287
+    },
+    {
+      "epoch": 0.40675555555555554,
+      "grad_norm": 0.3456438821828521,
+      "learning_rate": 0.0001343522704662845,
+      "loss": 0.5906,
+      "step": 2288
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.37167197979109773,
+      "learning_rate": 0.0001342981883621163,
+      "loss": 0.6793,
+      "step": 2289
+    },
+    {
+      "epoch": 0.4071111111111111,
+      "grad_norm": 0.3704497625024607,
+      "learning_rate": 0.000134244094886332,
+      "loss": 0.69,
+      "step": 2290
+    },
+    {
+      "epoch": 0.40728888888888887,
+      "grad_norm": 0.35927984762132265,
+      "learning_rate": 0.00013418999005686635,
+      "loss": 0.6429,
+      "step": 2291
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4003445829647246,
+      "learning_rate": 0.00013413587389165784,
+      "loss": 0.6859,
+      "step": 2292
+    },
+    {
+      "epoch": 0.4076444444444444,
+      "grad_norm": 0.36165890050622507,
+      "learning_rate": 0.0001340817464086488,
+      "loss": 0.6344,
+      "step": 2293
+    },
+    {
+      "epoch": 0.4078222222222222,
+      "grad_norm": 0.38020801064425364,
+      "learning_rate": 0.00013402760762578527,
+      "loss": 0.6616,
+      "step": 2294
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.345563933181974,
+      "learning_rate": 0.00013397345756101708,
+      "loss": 0.6557,
+      "step": 2295
+    },
+    {
+      "epoch": 0.40817777777777775,
+      "grad_norm": 0.34364179737224065,
+      "learning_rate": 0.00013391929623229773,
+      "loss": 0.6098,
+      "step": 2296
+    },
+    {
+      "epoch": 0.4083555555555556,
+      "grad_norm": 0.36496273070797147,
+      "learning_rate": 0.0001338651236575845,
+      "loss": 0.6298,
+      "step": 2297
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.3367366001547099,
+      "learning_rate": 0.00013381093985483837,
+      "loss": 0.6016,
+      "step": 2298
+    },
+    {
+      "epoch": 0.40871111111111114,
+      "grad_norm": 0.33798538043977244,
+      "learning_rate": 0.0001337567448420241,
+      "loss": 0.5834,
+      "step": 2299
+    },
+    {
+      "epoch": 0.4088888888888889,
+      "grad_norm": 0.3486084067281326,
+      "learning_rate": 0.00013370253863711007,
+      "loss": 0.6306,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.3327238937777495,
+      "learning_rate": 0.0001336483212580685,
+      "loss": 0.5926,
+      "step": 2301
+    },
+    {
+      "epoch": 0.40924444444444447,
+      "grad_norm": 0.3723619122906232,
+      "learning_rate": 0.0001335940927228752,
+      "loss": 0.6631,
+      "step": 2302
+    },
+    {
+      "epoch": 0.40942222222222224,
+      "grad_norm": 0.3569229877435541,
+      "learning_rate": 0.00013353985304950973,
+      "loss": 0.6086,
+      "step": 2303
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3825508026363746,
+      "learning_rate": 0.00013348560225595534,
+      "loss": 0.6668,
+      "step": 2304
+    },
+    {
+      "epoch": 0.4097777777777778,
+      "grad_norm": 0.3259140115763881,
+      "learning_rate": 0.00013343134036019895,
+      "loss": 0.5879,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4099555555555556,
+      "grad_norm": 0.3696174236507997,
+      "learning_rate": 0.0001333770673802312,
+      "loss": 0.6326,
+      "step": 2306
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.3726843674946877,
+      "learning_rate": 0.00013332278333404637,
+      "loss": 0.67,
+      "step": 2307
+    },
+    {
+      "epoch": 0.4103111111111111,
+      "grad_norm": 0.360137440913349,
+      "learning_rate": 0.00013326848823964243,
+      "loss": 0.6237,
+      "step": 2308
+    },
+    {
+      "epoch": 0.4104888888888889,
+      "grad_norm": 0.3538565241839011,
+      "learning_rate": 0.00013321418211502091,
+      "loss": 0.6366,
+      "step": 2309
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.36794012133754433,
+      "learning_rate": 0.0001331598649781872,
+      "loss": 0.6771,
+      "step": 2310
+    },
+    {
+      "epoch": 0.41084444444444446,
+      "grad_norm": 0.3660473619849432,
+      "learning_rate": 0.0001331055368471502,
+      "loss": 0.6541,
+      "step": 2311
+    },
+    {
+      "epoch": 0.41102222222222223,
+      "grad_norm": 0.34333918895498594,
+      "learning_rate": 0.00013305119773992247,
+      "loss": 0.618,
+      "step": 2312
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.34432545065198905,
+      "learning_rate": 0.0001329968476745202,
+      "loss": 0.6535,
+      "step": 2313
+    },
+    {
+      "epoch": 0.4113777777777778,
+      "grad_norm": 0.3499355029370304,
+      "learning_rate": 0.00013294248666896328,
+      "loss": 0.6242,
+      "step": 2314
+    },
+    {
+      "epoch": 0.41155555555555556,
+      "grad_norm": 0.3516993466408435,
+      "learning_rate": 0.00013288811474127516,
+      "loss": 0.61,
+      "step": 2315
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.33801311156422703,
+      "learning_rate": 0.00013283373190948295,
+      "loss": 0.6399,
+      "step": 2316
+    },
+    {
+      "epoch": 0.4119111111111111,
+      "grad_norm": 0.3453902813352448,
+      "learning_rate": 0.0001327793381916173,
+      "loss": 0.6186,
+      "step": 2317
+    },
+    {
+      "epoch": 0.4120888888888889,
+      "grad_norm": 0.38208799243876135,
+      "learning_rate": 0.00013272493360571262,
+      "loss": 0.6297,
+      "step": 2318
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.357668395257919,
+      "learning_rate": 0.0001326705181698068,
+      "loss": 0.6312,
+      "step": 2319
+    },
+    {
+      "epoch": 0.41244444444444445,
+      "grad_norm": 0.3665682879377501,
+      "learning_rate": 0.00013261609190194136,
+      "loss": 0.6348,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4126222222222222,
+      "grad_norm": 0.3551999719409832,
+      "learning_rate": 0.00013256165482016137,
+      "loss": 0.6634,
+      "step": 2321
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3686600288194294,
+      "learning_rate": 0.00013250720694251556,
+      "loss": 0.6128,
+      "step": 2322
+    },
+    {
+      "epoch": 0.4129777777777778,
+      "grad_norm": 0.35572854169481694,
+      "learning_rate": 0.0001324527482870562,
+      "loss": 0.6183,
+      "step": 2323
+    },
+    {
+      "epoch": 0.41315555555555555,
+      "grad_norm": 0.34935095978295044,
+      "learning_rate": 0.00013239827887183916,
+      "loss": 0.6093,
+      "step": 2324
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.36261703008313345,
+      "learning_rate": 0.0001323437987149238,
+      "loss": 0.6204,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4135111111111111,
+      "grad_norm": 0.3459300513653912,
+      "learning_rate": 0.0001322893078343732,
+      "loss": 0.6186,
+      "step": 2326
+    },
+    {
+      "epoch": 0.4136888888888889,
+      "grad_norm": 0.34976199172081734,
+      "learning_rate": 0.0001322348062482538,
+      "loss": 0.6479,
+      "step": 2327
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.348935342106647,
+      "learning_rate": 0.0001321802939746357,
+      "loss": 0.6053,
+      "step": 2328
+    },
+    {
+      "epoch": 0.41404444444444444,
+      "grad_norm": 0.37180153525586357,
+      "learning_rate": 0.00013212577103159258,
+      "loss": 0.6899,
+      "step": 2329
+    },
+    {
+      "epoch": 0.4142222222222222,
+      "grad_norm": 0.3293784470438587,
+      "learning_rate": 0.00013207123743720156,
+      "loss": 0.5666,
+      "step": 2330
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.3268395656725775,
+      "learning_rate": 0.00013201669320954333,
+      "loss": 0.621,
+      "step": 2331
+    },
+    {
+      "epoch": 0.41457777777777777,
+      "grad_norm": 0.34569902660741436,
+      "learning_rate": 0.00013196213836670214,
+      "loss": 0.6676,
+      "step": 2332
+    },
+    {
+      "epoch": 0.41475555555555554,
+      "grad_norm": 0.3356318153386259,
+      "learning_rate": 0.0001319075729267657,
+      "loss": 0.6,
+      "step": 2333
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.34386109306069956,
+      "learning_rate": 0.00013185299690782526,
+      "loss": 0.6226,
+      "step": 2334
+    },
+    {
+      "epoch": 0.4151111111111111,
+      "grad_norm": 0.3623760791532519,
+      "learning_rate": 0.00013179841032797565,
+      "loss": 0.6634,
+      "step": 2335
+    },
+    {
+      "epoch": 0.4152888888888889,
+      "grad_norm": 0.3757665488245912,
+      "learning_rate": 0.00013174381320531505,
+      "loss": 0.6094,
+      "step": 2336
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.3481347352138883,
+      "learning_rate": 0.00013168920555794525,
+      "loss": 0.654,
+      "step": 2337
+    },
+    {
+      "epoch": 0.4156444444444444,
+      "grad_norm": 0.3452942094953099,
+      "learning_rate": 0.00013163458740397149,
+      "loss": 0.623,
+      "step": 2338
+    },
+    {
+      "epoch": 0.4158222222222222,
+      "grad_norm": 0.3891500238233552,
+      "learning_rate": 0.0001315799587615025,
+      "loss": 0.6763,
+      "step": 2339
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.37927359653513915,
+      "learning_rate": 0.00013152531964865052,
+      "loss": 0.6334,
+      "step": 2340
+    },
+    {
+      "epoch": 0.41617777777777776,
+      "grad_norm": 0.3659354644333141,
+      "learning_rate": 0.0001314706700835312,
+      "loss": 0.6467,
+      "step": 2341
+    },
+    {
+      "epoch": 0.41635555555555553,
+      "grad_norm": 0.3602640145481248,
+      "learning_rate": 0.00013141601008426372,
+      "loss": 0.6565,
+      "step": 2342
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.3573937935188667,
+      "learning_rate": 0.00013136133966897064,
+      "loss": 0.6413,
+      "step": 2343
+    },
+    {
+      "epoch": 0.4167111111111111,
+      "grad_norm": 0.3399284147387536,
+      "learning_rate": 0.00013130665885577805,
+      "loss": 0.6374,
+      "step": 2344
+    },
+    {
+      "epoch": 0.41688888888888886,
+      "grad_norm": 0.3646181995748407,
+      "learning_rate": 0.00013125196766281544,
+      "loss": 0.6584,
+      "step": 2345
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.35329182669007025,
+      "learning_rate": 0.00013119726610821576,
+      "loss": 0.645,
+      "step": 2346
+    },
+    {
+      "epoch": 0.4172444444444444,
+      "grad_norm": 0.3704745300532282,
+      "learning_rate": 0.0001311425542101154,
+      "loss": 0.638,
+      "step": 2347
+    },
+    {
+      "epoch": 0.4174222222222222,
+      "grad_norm": 0.36525748658024687,
+      "learning_rate": 0.00013108783198665416,
+      "loss": 0.6985,
+      "step": 2348
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.35462538691231243,
+      "learning_rate": 0.0001310330994559753,
+      "loss": 0.6251,
+      "step": 2349
+    },
+    {
+      "epoch": 0.4177777777777778,
+      "grad_norm": 0.3269515104611661,
+      "learning_rate": 0.00013097835663622545,
+      "loss": 0.5992,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4179555555555556,
+      "grad_norm": 0.3640316349009023,
+      "learning_rate": 0.00013092360354555467,
+      "loss": 0.5984,
+      "step": 2351
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.37482757256302207,
+      "learning_rate": 0.00013086884020211645,
+      "loss": 0.6042,
+      "step": 2352
+    },
+    {
+      "epoch": 0.41831111111111113,
+      "grad_norm": 0.3805531636261812,
+      "learning_rate": 0.00013081406662406763,
+      "loss": 0.653,
+      "step": 2353
+    },
+    {
+      "epoch": 0.4184888888888889,
+      "grad_norm": 0.3692067429214998,
+      "learning_rate": 0.00013075928282956853,
+      "loss": 0.6345,
+      "step": 2354
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.3677934881832081,
+      "learning_rate": 0.00013070448883678275,
+      "loss": 0.6303,
+      "step": 2355
+    },
+    {
+      "epoch": 0.41884444444444446,
+      "grad_norm": 0.3861103220341804,
+      "learning_rate": 0.0001306496846638773,
+      "loss": 0.661,
+      "step": 2356
+    },
+    {
+      "epoch": 0.41902222222222224,
+      "grad_norm": 0.37324704760877364,
+      "learning_rate": 0.00013059487032902268,
+      "loss": 0.6391,
+      "step": 2357
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3506326891064393,
+      "learning_rate": 0.00013054004585039258,
+      "loss": 0.6097,
+      "step": 2358
+    },
+    {
+      "epoch": 0.4193777777777778,
+      "grad_norm": 0.37755571556437467,
+      "learning_rate": 0.00013048521124616418,
+      "loss": 0.682,
+      "step": 2359
+    },
+    {
+      "epoch": 0.41955555555555557,
+      "grad_norm": 0.35562443904162094,
+      "learning_rate": 0.00013043036653451794,
+      "loss": 0.6072,
+      "step": 2360
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.34694886253349744,
+      "learning_rate": 0.00013037551173363774,
+      "loss": 0.6163,
+      "step": 2361
+    },
+    {
+      "epoch": 0.4199111111111111,
+      "grad_norm": 0.3578936885901449,
+      "learning_rate": 0.00013032064686171075,
+      "loss": 0.614,
+      "step": 2362
+    },
+    {
+      "epoch": 0.4200888888888889,
+      "grad_norm": 0.356159775126546,
+      "learning_rate": 0.0001302657719369275,
+      "loss": 0.6326,
+      "step": 2363
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.36086237377679226,
+      "learning_rate": 0.0001302108869774819,
+      "loss": 0.692,
+      "step": 2364
+    },
+    {
+      "epoch": 0.42044444444444445,
+      "grad_norm": 0.374333320230105,
+      "learning_rate": 0.00013015599200157107,
+      "loss": 0.6438,
+      "step": 2365
+    },
+    {
+      "epoch": 0.42062222222222223,
+      "grad_norm": 0.35999442735301823,
+      "learning_rate": 0.00013010108702739558,
+      "loss": 0.6396,
+      "step": 2366
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.37599794992759844,
+      "learning_rate": 0.00013004617207315922,
+      "loss": 0.6418,
+      "step": 2367
+    },
+    {
+      "epoch": 0.4209777777777778,
+      "grad_norm": 0.3519403982606927,
+      "learning_rate": 0.00012999124715706915,
+      "loss": 0.6289,
+      "step": 2368
+    },
+    {
+      "epoch": 0.42115555555555556,
+      "grad_norm": 0.3369286143577679,
+      "learning_rate": 0.00012993631229733582,
+      "loss": 0.5936,
+      "step": 2369
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.34394477663961104,
+      "learning_rate": 0.00012988136751217291,
+      "loss": 0.5947,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4215111111111111,
+      "grad_norm": 0.3733845858367674,
+      "learning_rate": 0.0001298264128197975,
+      "loss": 0.5753,
+      "step": 2371
+    },
+    {
+      "epoch": 0.4216888888888889,
+      "grad_norm": 0.3591871915226606,
+      "learning_rate": 0.0001297714482384299,
+      "loss": 0.6199,
+      "step": 2372
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.35250390532830783,
+      "learning_rate": 0.00012971647378629366,
+      "loss": 0.6679,
+      "step": 2373
+    },
+    {
+      "epoch": 0.42204444444444444,
+      "grad_norm": 0.3553128954348677,
+      "learning_rate": 0.00012966148948161569,
+      "loss": 0.656,
+      "step": 2374
+    },
+    {
+      "epoch": 0.4222222222222222,
+      "grad_norm": 0.36651801744600043,
+      "learning_rate": 0.00012960649534262607,
+      "loss": 0.5725,
+      "step": 2375
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.36551793616827166,
+      "learning_rate": 0.00012955149138755821,
+      "loss": 0.6557,
+      "step": 2376
+    },
+    {
+      "epoch": 0.4225777777777778,
+      "grad_norm": 0.34508972530139137,
+      "learning_rate": 0.0001294964776346488,
+      "loss": 0.6181,
+      "step": 2377
+    },
+    {
+      "epoch": 0.42275555555555555,
+      "grad_norm": 0.3654008193872436,
+      "learning_rate": 0.00012944145410213764,
+      "loss": 0.6579,
+      "step": 2378
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.35230954792235686,
+      "learning_rate": 0.00012938642080826795,
+      "loss": 0.6206,
+      "step": 2379
+    },
+    {
+      "epoch": 0.4231111111111111,
+      "grad_norm": 0.36609973543745744,
+      "learning_rate": 0.00012933137777128607,
+      "loss": 0.5961,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4232888888888889,
+      "grad_norm": 0.35617888987419094,
+      "learning_rate": 0.00012927632500944161,
+      "loss": 0.6365,
+      "step": 2381
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.36093591553360777,
+      "learning_rate": 0.00012922126254098735,
+      "loss": 0.6055,
+      "step": 2382
+    },
+    {
+      "epoch": 0.42364444444444443,
+      "grad_norm": 0.3344674802417674,
+      "learning_rate": 0.0001291661903841794,
+      "loss": 0.6264,
+      "step": 2383
+    },
+    {
+      "epoch": 0.4238222222222222,
+      "grad_norm": 0.34276341314754455,
+      "learning_rate": 0.000129111108557277,
+      "loss": 0.6052,
+      "step": 2384
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.39669484073720285,
+      "learning_rate": 0.00012905601707854255,
+      "loss": 0.6882,
+      "step": 2385
+    },
+    {
+      "epoch": 0.42417777777777776,
+      "grad_norm": 0.3754941260882933,
+      "learning_rate": 0.0001290009159662418,
+      "loss": 0.6323,
+      "step": 2386
+    },
+    {
+      "epoch": 0.42435555555555554,
+      "grad_norm": 0.35504806432726915,
+      "learning_rate": 0.00012894580523864358,
+      "loss": 0.6488,
+      "step": 2387
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.35310960734760505,
+      "learning_rate": 0.0001288906849140199,
+      "loss": 0.6,
+      "step": 2388
+    },
+    {
+      "epoch": 0.4247111111111111,
+      "grad_norm": 0.36621650743851425,
+      "learning_rate": 0.00012883555501064603,
+      "loss": 0.6616,
+      "step": 2389
+    },
+    {
+      "epoch": 0.42488888888888887,
+      "grad_norm": 0.3656296247330148,
+      "learning_rate": 0.0001287804155468004,
+      "loss": 0.6211,
+      "step": 2390
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3627491764165094,
+      "learning_rate": 0.0001287252665407645,
+      "loss": 0.5853,
+      "step": 2391
+    },
+    {
+      "epoch": 0.4252444444444444,
+      "grad_norm": 0.36252998478817616,
+      "learning_rate": 0.00012867010801082308,
+      "loss": 0.5652,
+      "step": 2392
+    },
+    {
+      "epoch": 0.4254222222222222,
+      "grad_norm": 0.3779783758073857,
+      "learning_rate": 0.0001286149399752641,
+      "loss": 0.6263,
+      "step": 2393
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.600048124131751,
+      "learning_rate": 0.00012855976245237854,
+      "loss": 0.6255,
+      "step": 2394
+    },
+    {
+      "epoch": 0.42577777777777776,
+      "grad_norm": 0.3981944200306016,
+      "learning_rate": 0.00012850457546046063,
+      "loss": 0.7123,
+      "step": 2395
+    },
+    {
+      "epoch": 0.42595555555555553,
+      "grad_norm": 0.34499464572562755,
+      "learning_rate": 0.00012844937901780766,
+      "loss": 0.6232,
+      "step": 2396
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.35532483125508896,
+      "learning_rate": 0.00012839417314272015,
+      "loss": 0.5934,
+      "step": 2397
+    },
+    {
+      "epoch": 0.4263111111111111,
+      "grad_norm": 0.34412101816347485,
+      "learning_rate": 0.00012833895785350165,
+      "loss": 0.5892,
+      "step": 2398
+    },
+    {
+      "epoch": 0.42648888888888886,
+      "grad_norm": 0.3538756389735311,
+      "learning_rate": 0.00012828373316845886,
+      "loss": 0.6319,
+      "step": 2399
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.34845042454957637,
+      "learning_rate": 0.00012822849910590166,
+      "loss": 0.5963,
+      "step": 2400
+    },
+    {
+      "epoch": 0.42684444444444447,
+      "grad_norm": 0.3620431462516756,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 0.6437,
+      "step": 2401
+    },
+    {
+      "epoch": 0.42702222222222225,
+      "grad_norm": 0.3659724988605635,
+      "learning_rate": 0.0001281180029214988,
+      "loss": 0.6484,
+      "step": 2402
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.36758670904619567,
+      "learning_rate": 0.00012806274083628833,
+      "loss": 0.5708,
+      "step": 2403
+    },
+    {
+      "epoch": 0.4273777777777778,
+      "grad_norm": 0.3473949133208065,
+      "learning_rate": 0.00012800746944683372,
+      "loss": 0.6593,
+      "step": 2404
+    },
+    {
+      "epoch": 0.4275555555555556,
+      "grad_norm": 0.3676037859972472,
+      "learning_rate": 0.00012795218877146035,
+      "loss": 0.6896,
+      "step": 2405
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.34847896656952704,
+      "learning_rate": 0.00012789689882849659,
+      "loss": 0.6343,
+      "step": 2406
+    },
+    {
+      "epoch": 0.42791111111111113,
+      "grad_norm": 0.37702515284495397,
+      "learning_rate": 0.0001278415996362739,
+      "loss": 0.6818,
+      "step": 2407
+    },
+    {
+      "epoch": 0.4280888888888889,
+      "grad_norm": 0.36710909666073355,
+      "learning_rate": 0.0001277862912131268,
+      "loss": 0.6272,
+      "step": 2408
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.3663485642828436,
+      "learning_rate": 0.00012773097357739288,
+      "loss": 0.6465,
+      "step": 2409
+    },
+    {
+      "epoch": 0.42844444444444446,
+      "grad_norm": 0.34390668098915267,
+      "learning_rate": 0.0001276756467474128,
+      "loss": 0.6155,
+      "step": 2410
+    },
+    {
+      "epoch": 0.42862222222222224,
+      "grad_norm": 0.3989493163609929,
+      "learning_rate": 0.0001276203107415303,
+      "loss": 0.6654,
+      "step": 2411
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3571062608487482,
+      "learning_rate": 0.00012756496557809202,
+      "loss": 0.6202,
+      "step": 2412
+    },
+    {
+      "epoch": 0.4289777777777778,
+      "grad_norm": 0.37953012267640956,
+      "learning_rate": 0.0001275096112754478,
+      "loss": 0.6471,
+      "step": 2413
+    },
+    {
+      "epoch": 0.42915555555555557,
+      "grad_norm": 0.5038903073517091,
+      "learning_rate": 0.00012745424785195043,
+      "loss": 0.6649,
+      "step": 2414
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.35960086466595215,
+      "learning_rate": 0.00012739887532595574,
+      "loss": 0.6343,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4295111111111111,
+      "grad_norm": 0.3701250126537466,
+      "learning_rate": 0.00012734349371582254,
+      "loss": 0.6698,
+      "step": 2416
+    },
+    {
+      "epoch": 0.4296888888888889,
+      "grad_norm": 0.39305014199113314,
+      "learning_rate": 0.0001272881030399127,
+      "loss": 0.6831,
+      "step": 2417
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.37114882218702244,
+      "learning_rate": 0.00012723270331659113,
+      "loss": 0.6398,
+      "step": 2418
+    },
+    {
+      "epoch": 0.43004444444444445,
+      "grad_norm": 0.36658739490811704,
+      "learning_rate": 0.00012717729456422565,
+      "loss": 0.656,
+      "step": 2419
+    },
+    {
+      "epoch": 0.43022222222222223,
+      "grad_norm": 0.3519966513320916,
+      "learning_rate": 0.00012712187680118713,
+      "loss": 0.6201,
+      "step": 2420
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3551711251204549,
+      "learning_rate": 0.00012706645004584936,
+      "loss": 0.6228,
+      "step": 2421
+    },
+    {
+      "epoch": 0.4305777777777778,
+      "grad_norm": 0.35281476469675455,
+      "learning_rate": 0.00012701101431658924,
+      "loss": 0.6372,
+      "step": 2422
+    },
+    {
+      "epoch": 0.43075555555555556,
+      "grad_norm": 0.37113668378033865,
+      "learning_rate": 0.00012695556963178653,
+      "loss": 0.6471,
+      "step": 2423
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.3541117125805001,
+      "learning_rate": 0.000126900116009824,
+      "loss": 0.6214,
+      "step": 2424
+    },
+    {
+      "epoch": 0.4311111111111111,
+      "grad_norm": 0.41086519789286663,
+      "learning_rate": 0.00012684465346908742,
+      "loss": 0.6837,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4312888888888889,
+      "grad_norm": 0.3569640597504126,
+      "learning_rate": 0.0001267891820279654,
+      "loss": 0.634,
+      "step": 2426
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.40060018196959896,
+      "learning_rate": 0.00012673370170484968,
+      "loss": 0.6263,
+      "step": 2427
+    },
+    {
+      "epoch": 0.43164444444444444,
+      "grad_norm": 0.34122440907489915,
+      "learning_rate": 0.00012667821251813479,
+      "loss": 0.5851,
+      "step": 2428
+    },
+    {
+      "epoch": 0.4318222222222222,
+      "grad_norm": 0.35860201472900305,
+      "learning_rate": 0.00012662271448621822,
+      "loss": 0.6237,
+      "step": 2429
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.38923528850450995,
+      "learning_rate": 0.0001265672076275005,
+      "loss": 0.634,
+      "step": 2430
+    },
+    {
+      "epoch": 0.43217777777777777,
+      "grad_norm": 0.37793130135844377,
+      "learning_rate": 0.00012651169196038496,
+      "loss": 0.6842,
+      "step": 2431
+    },
+    {
+      "epoch": 0.43235555555555555,
+      "grad_norm": 0.3483716650887419,
+      "learning_rate": 0.0001264561675032779,
+      "loss": 0.6139,
+      "step": 2432
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.3525398083572766,
+      "learning_rate": 0.00012640063427458856,
+      "loss": 0.6484,
+      "step": 2433
+    },
+    {
+      "epoch": 0.4327111111111111,
+      "grad_norm": 0.3385149688034773,
+      "learning_rate": 0.00012634509229272908,
+      "loss": 0.5935,
+      "step": 2434
+    },
+    {
+      "epoch": 0.4328888888888889,
+      "grad_norm": 0.34811250059530324,
+      "learning_rate": 0.0001262895415761145,
+      "loss": 0.6183,
+      "step": 2435
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.3316129669981448,
+      "learning_rate": 0.00012623398214316268,
+      "loss": 0.6097,
+      "step": 2436
+    },
+    {
+      "epoch": 0.43324444444444443,
+      "grad_norm": 0.3624292741374519,
+      "learning_rate": 0.00012617841401229446,
+      "loss": 0.622,
+      "step": 2437
+    },
+    {
+      "epoch": 0.4334222222222222,
+      "grad_norm": 0.3673924536245363,
+      "learning_rate": 0.00012612283720193356,
+      "loss": 0.6355,
+      "step": 2438
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.36232833272197146,
+      "learning_rate": 0.00012606725173050653,
+      "loss": 0.6143,
+      "step": 2439
+    },
+    {
+      "epoch": 0.43377777777777776,
+      "grad_norm": 0.38633378002603624,
+      "learning_rate": 0.00012601165761644286,
+      "loss": 0.6009,
+      "step": 2440
+    },
+    {
+      "epoch": 0.43395555555555554,
+      "grad_norm": 0.4596118286035029,
+      "learning_rate": 0.00012595605487817482,
+      "loss": 0.6184,
+      "step": 2441
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.36985006980277124,
+      "learning_rate": 0.00012590044353413758,
+      "loss": 0.6598,
+      "step": 2442
+    },
+    {
+      "epoch": 0.4343111111111111,
+      "grad_norm": 0.36243396049438226,
+      "learning_rate": 0.0001258448236027692,
+      "loss": 0.6999,
+      "step": 2443
+    },
+    {
+      "epoch": 0.43448888888888887,
+      "grad_norm": 0.3789757929823581,
+      "learning_rate": 0.0001257891951025105,
+      "loss": 0.6688,
+      "step": 2444
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4359100608013618,
+      "learning_rate": 0.00012573355805180523,
+      "loss": 0.5645,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4348444444444444,
+      "grad_norm": 0.3495815058136773,
+      "learning_rate": 0.00012567791246909994,
+      "loss": 0.6598,
+      "step": 2446
+    },
+    {
+      "epoch": 0.4350222222222222,
+      "grad_norm": 0.3666436268806853,
+      "learning_rate": 0.000125622258372844,
+      "loss": 0.6545,
+      "step": 2447
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3647740773109535,
+      "learning_rate": 0.0001255665957814896,
+      "loss": 0.6018,
+      "step": 2448
+    },
+    {
+      "epoch": 0.43537777777777775,
+      "grad_norm": 0.3419448831609855,
+      "learning_rate": 0.00012551092471349177,
+      "loss": 0.6042,
+      "step": 2449
+    },
+    {
+      "epoch": 0.43555555555555553,
+      "grad_norm": 0.40815142500533824,
+      "learning_rate": 0.00012545524518730835,
+      "loss": 0.6712,
+      "step": 2450
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.3696712101640166,
+      "learning_rate": 0.0001253995572213999,
+      "loss": 0.6243,
+      "step": 2451
+    },
+    {
+      "epoch": 0.43591111111111114,
+      "grad_norm": 0.3623882862598176,
+      "learning_rate": 0.00012534386083422997,
+      "loss": 0.5978,
+      "step": 2452
+    },
+    {
+      "epoch": 0.4360888888888889,
+      "grad_norm": 0.3864446537210954,
+      "learning_rate": 0.0001252881560442647,
+      "loss": 0.6683,
+      "step": 2453
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.36650734588962697,
+      "learning_rate": 0.00012523244286997309,
+      "loss": 0.64,
+      "step": 2454
+    },
+    {
+      "epoch": 0.43644444444444447,
+      "grad_norm": 0.36321631434588475,
+      "learning_rate": 0.00012517672132982693,
+      "loss": 0.6238,
+      "step": 2455
+    },
+    {
+      "epoch": 0.43662222222222224,
+      "grad_norm": 0.35887830476599214,
+      "learning_rate": 0.00012512099144230084,
+      "loss": 0.6964,
+      "step": 2456
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3523114257936633,
+      "learning_rate": 0.00012506525322587207,
+      "loss": 0.6043,
+      "step": 2457
+    },
+    {
+      "epoch": 0.4369777777777778,
+      "grad_norm": 0.3396440867494397,
+      "learning_rate": 0.00012500950669902075,
+      "loss": 0.6065,
+      "step": 2458
+    },
+    {
+      "epoch": 0.4371555555555556,
+      "grad_norm": 0.34500582618109993,
+      "learning_rate": 0.00012495375188022973,
+      "loss": 0.6218,
+      "step": 2459
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.35652736286516173,
+      "learning_rate": 0.0001248979887879846,
+      "loss": 0.6105,
+      "step": 2460
+    },
+    {
+      "epoch": 0.43751111111111113,
+      "grad_norm": 0.3532454915551023,
+      "learning_rate": 0.00012484221744077367,
+      "loss": 0.6501,
+      "step": 2461
+    },
+    {
+      "epoch": 0.4376888888888889,
+      "grad_norm": 0.3751697591005347,
+      "learning_rate": 0.00012478643785708806,
+      "loss": 0.6109,
+      "step": 2462
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.3576413314460664,
+      "learning_rate": 0.00012473065005542155,
+      "loss": 0.6042,
+      "step": 2463
+    },
+    {
+      "epoch": 0.43804444444444446,
+      "grad_norm": 0.36242225247095117,
+      "learning_rate": 0.00012467485405427068,
+      "loss": 0.6535,
+      "step": 2464
+    },
+    {
+      "epoch": 0.43822222222222224,
+      "grad_norm": 0.36517761020329187,
+      "learning_rate": 0.00012461904987213468,
+      "loss": 0.655,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.36253256730110056,
+      "learning_rate": 0.00012456323752751554,
+      "loss": 0.6216,
+      "step": 2466
+    },
+    {
+      "epoch": 0.4385777777777778,
+      "grad_norm": 0.35740430775994647,
+      "learning_rate": 0.00012450741703891788,
+      "loss": 0.6633,
+      "step": 2467
+    },
+    {
+      "epoch": 0.43875555555555557,
+      "grad_norm": 0.34728820177508646,
+      "learning_rate": 0.0001244515884248491,
+      "loss": 0.5837,
+      "step": 2468
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.37899085198417637,
+      "learning_rate": 0.00012439575170381927,
+      "loss": 0.6687,
+      "step": 2469
+    },
+    {
+      "epoch": 0.4391111111111111,
+      "grad_norm": 0.380438314635491,
+      "learning_rate": 0.00012433990689434112,
+      "loss": 0.6858,
+      "step": 2470
+    },
+    {
+      "epoch": 0.4392888888888889,
+      "grad_norm": 0.3597587224277307,
+      "learning_rate": 0.0001242840540149301,
+      "loss": 0.601,
+      "step": 2471
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.42792811192738855,
+      "learning_rate": 0.0001242281930841043,
+      "loss": 0.5907,
+      "step": 2472
+    },
+    {
+      "epoch": 0.43964444444444445,
+      "grad_norm": 0.3511489103832953,
+      "learning_rate": 0.00012417232412038448,
+      "loss": 0.6167,
+      "step": 2473
+    },
+    {
+      "epoch": 0.4398222222222222,
+      "grad_norm": 0.348201520590711,
+      "learning_rate": 0.0001241164471422941,
+      "loss": 0.5894,
+      "step": 2474
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.34978829020802815,
+      "learning_rate": 0.00012406056216835928,
+      "loss": 0.6216,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4401777777777778,
+      "grad_norm": 0.3683729504483215,
+      "learning_rate": 0.00012400466921710874,
+      "loss": 0.5906,
+      "step": 2476
+    },
+    {
+      "epoch": 0.44035555555555556,
+      "grad_norm": 0.35557258402876357,
+      "learning_rate": 0.00012394876830707386,
+      "loss": 0.5703,
+      "step": 2477
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.355893979286471,
+      "learning_rate": 0.0001238928594567887,
+      "loss": 0.6125,
+      "step": 2478
+    },
+    {
+      "epoch": 0.4407111111111111,
+      "grad_norm": 0.3575680617367579,
+      "learning_rate": 0.00012383694268478993,
+      "loss": 0.6494,
+      "step": 2479
+    },
+    {
+      "epoch": 0.4408888888888889,
+      "grad_norm": 0.36079630985236694,
+      "learning_rate": 0.0001237810180096168,
+      "loss": 0.6612,
+      "step": 2480
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.36708244051107575,
+      "learning_rate": 0.0001237250854498112,
+      "loss": 0.6396,
+      "step": 2481
+    },
+    {
+      "epoch": 0.44124444444444444,
+      "grad_norm": 0.3583807231088781,
+      "learning_rate": 0.00012366914502391776,
+      "loss": 0.6217,
+      "step": 2482
+    },
+    {
+      "epoch": 0.4414222222222222,
+      "grad_norm": 0.34840330403646036,
+      "learning_rate": 0.0001236131967504835,
+      "loss": 0.6183,
+      "step": 2483
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.35602707774502895,
+      "learning_rate": 0.00012355724064805823,
+      "loss": 0.6347,
+      "step": 2484
+    },
+    {
+      "epoch": 0.44177777777777777,
+      "grad_norm": 0.3468948554115569,
+      "learning_rate": 0.00012350127673519426,
+      "loss": 0.6358,
+      "step": 2485
+    },
+    {
+      "epoch": 0.44195555555555555,
+      "grad_norm": 0.3723036438014148,
+      "learning_rate": 0.00012344530503044648,
+      "loss": 0.6288,
+      "step": 2486
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.39322452675440267,
+      "learning_rate": 0.00012338932555237242,
+      "loss": 0.6361,
+      "step": 2487
+    },
+    {
+      "epoch": 0.4423111111111111,
+      "grad_norm": 0.35577272957813433,
+      "learning_rate": 0.00012333333831953216,
+      "loss": 0.5971,
+      "step": 2488
+    },
+    {
+      "epoch": 0.4424888888888889,
+      "grad_norm": 0.3583997104842971,
+      "learning_rate": 0.00012327734335048837,
+      "loss": 0.6262,
+      "step": 2489
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.34672771044651235,
+      "learning_rate": 0.0001232213406638062,
+      "loss": 0.6734,
+      "step": 2490
+    },
+    {
+      "epoch": 0.44284444444444443,
+      "grad_norm": 0.344755464929935,
+      "learning_rate": 0.00012316533027805353,
+      "loss": 0.6225,
+      "step": 2491
+    },
+    {
+      "epoch": 0.4430222222222222,
+      "grad_norm": 0.3703909236340882,
+      "learning_rate": 0.0001231093122118006,
+      "loss": 0.6603,
+      "step": 2492
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3637871650426539,
+      "learning_rate": 0.00012305328648362028,
+      "loss": 0.6046,
+      "step": 2493
+    },
+    {
+      "epoch": 0.44337777777777776,
+      "grad_norm": 0.3634852893036694,
+      "learning_rate": 0.00012299725311208808,
+      "loss": 0.638,
+      "step": 2494
+    },
+    {
+      "epoch": 0.44355555555555554,
+      "grad_norm": 0.3554437317670213,
+      "learning_rate": 0.00012294121211578184,
+      "loss": 0.606,
+      "step": 2495
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.36010220346759225,
+      "learning_rate": 0.00012288516351328208,
+      "loss": 0.6202,
+      "step": 2496
+    },
+    {
+      "epoch": 0.4439111111111111,
+      "grad_norm": 0.3533957637130856,
+      "learning_rate": 0.0001228291073231718,
+      "loss": 0.6206,
+      "step": 2497
+    },
+    {
+      "epoch": 0.44408888888888887,
+      "grad_norm": 0.36553838722476206,
+      "learning_rate": 0.00012277304356403656,
+      "loss": 0.6145,
+      "step": 2498
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.3600575736028397,
+      "learning_rate": 0.0001227169722544643,
+      "loss": 0.5992,
+      "step": 2499
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.38262337271538,
+      "learning_rate": 0.0001226608934130456,
+      "loss": 0.6422,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4446222222222222,
+      "grad_norm": 0.35309795321313664,
+      "learning_rate": 0.0001226048070583735,
+      "loss": 0.6093,
+      "step": 2501
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3537591815446702,
+      "learning_rate": 0.00012254871320904347,
+      "loss": 0.5859,
+      "step": 2502
+    },
+    {
+      "epoch": 0.4449777777777778,
+      "grad_norm": 0.38955417158894107,
+      "learning_rate": 0.0001224926118836535,
+      "loss": 0.6054,
+      "step": 2503
+    },
+    {
+      "epoch": 0.4451555555555556,
+      "grad_norm": 0.3880502150274596,
+      "learning_rate": 0.00012243650310080412,
+      "loss": 0.6261,
+      "step": 2504
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.3507029394431925,
+      "learning_rate": 0.0001223803868790983,
+      "loss": 0.6313,
+      "step": 2505
+    },
+    {
+      "epoch": 0.44551111111111114,
+      "grad_norm": 0.334419017104526,
+      "learning_rate": 0.00012232426323714136,
+      "loss": 0.6239,
+      "step": 2506
+    },
+    {
+      "epoch": 0.4456888888888889,
+      "grad_norm": 0.3640862568304381,
+      "learning_rate": 0.00012226813219354122,
+      "loss": 0.6512,
+      "step": 2507
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.33968625510731293,
+      "learning_rate": 0.00012221199376690825,
+      "loss": 0.6077,
+      "step": 2508
+    },
+    {
+      "epoch": 0.44604444444444447,
+      "grad_norm": 0.3606677468042819,
+      "learning_rate": 0.00012215584797585524,
+      "loss": 0.6605,
+      "step": 2509
+    },
+    {
+      "epoch": 0.44622222222222224,
+      "grad_norm": 0.3370036884890409,
+      "learning_rate": 0.00012209969483899735,
+      "loss": 0.6109,
+      "step": 2510
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3727276030707424,
+      "learning_rate": 0.00012204353437495228,
+      "loss": 0.5754,
+      "step": 2511
+    },
+    {
+      "epoch": 0.4465777777777778,
+      "grad_norm": 0.3814855270365171,
+      "learning_rate": 0.00012198736660234009,
+      "loss": 0.6124,
+      "step": 2512
+    },
+    {
+      "epoch": 0.4467555555555556,
+      "grad_norm": 0.34305297471379737,
+      "learning_rate": 0.00012193119153978332,
+      "loss": 0.5901,
+      "step": 2513
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.3215511453171052,
+      "learning_rate": 0.00012187500920590689,
+      "loss": 0.59,
+      "step": 2514
+    },
+    {
+      "epoch": 0.4471111111111111,
+      "grad_norm": 0.35959590890962223,
+      "learning_rate": 0.0001218188196193381,
+      "loss": 0.6841,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4472888888888889,
+      "grad_norm": 0.3649763905835896,
+      "learning_rate": 0.00012176262279870673,
+      "loss": 0.6506,
+      "step": 2516
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.3601431735851272,
+      "learning_rate": 0.0001217064187626449,
+      "loss": 0.6435,
+      "step": 2517
+    },
+    {
+      "epoch": 0.44764444444444446,
+      "grad_norm": 0.35964138205055174,
+      "learning_rate": 0.00012165020752978718,
+      "loss": 0.6535,
+      "step": 2518
+    },
+    {
+      "epoch": 0.44782222222222223,
+      "grad_norm": 0.34726631250201273,
+      "learning_rate": 0.00012159398911877045,
+      "loss": 0.6134,
+      "step": 2519
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.35449870451567134,
+      "learning_rate": 0.00012153776354823401,
+      "loss": 0.6081,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4481777777777778,
+      "grad_norm": 0.34573664350869815,
+      "learning_rate": 0.00012148153083681954,
+      "loss": 0.6142,
+      "step": 2521
+    },
+    {
+      "epoch": 0.44835555555555556,
+      "grad_norm": 0.36060712389294625,
+      "learning_rate": 0.0001214252910031711,
+      "loss": 0.6502,
+      "step": 2522
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.3826132897859077,
+      "learning_rate": 0.00012136904406593507,
+      "loss": 0.6432,
+      "step": 2523
+    },
+    {
+      "epoch": 0.4487111111111111,
+      "grad_norm": 0.34896381190658043,
+      "learning_rate": 0.00012131279004376024,
+      "loss": 0.6414,
+      "step": 2524
+    },
+    {
+      "epoch": 0.4488888888888889,
+      "grad_norm": 0.3600605051763946,
+      "learning_rate": 0.00012125652895529766,
+      "loss": 0.6405,
+      "step": 2525
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.36541280183275093,
+      "learning_rate": 0.00012120026081920084,
+      "loss": 0.5968,
+      "step": 2526
+    },
+    {
+      "epoch": 0.44924444444444445,
+      "grad_norm": 0.3562322280328803,
+      "learning_rate": 0.00012114398565412553,
+      "loss": 0.6113,
+      "step": 2527
+    },
+    {
+      "epoch": 0.4494222222222222,
+      "grad_norm": 0.36541262544716824,
+      "learning_rate": 0.00012108770347872982,
+      "loss": 0.6251,
+      "step": 2528
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.35789351173943645,
+      "learning_rate": 0.0001210314143116742,
+      "loss": 0.6362,
+      "step": 2529
+    },
+    {
+      "epoch": 0.4497777777777778,
+      "grad_norm": 0.3812303750695318,
+      "learning_rate": 0.00012097511817162139,
+      "loss": 0.6218,
+      "step": 2530
+    },
+    {
+      "epoch": 0.44995555555555555,
+      "grad_norm": 0.3548281963049733,
+      "learning_rate": 0.00012091881507723651,
+      "loss": 0.6274,
+      "step": 2531
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.3490925903863767,
+      "learning_rate": 0.00012086250504718687,
+      "loss": 0.6224,
+      "step": 2532
+    },
+    {
+      "epoch": 0.4503111111111111,
+      "grad_norm": 0.372575977805126,
+      "learning_rate": 0.00012080618810014221,
+      "loss": 0.6413,
+      "step": 2533
+    },
+    {
+      "epoch": 0.4504888888888889,
+      "grad_norm": 0.3664897624480647,
+      "learning_rate": 0.00012074986425477445,
+      "loss": 0.6512,
+      "step": 2534
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.35200150464177166,
+      "learning_rate": 0.00012069353352975787,
+      "loss": 0.6204,
+      "step": 2535
+    },
+    {
+      "epoch": 0.45084444444444444,
+      "grad_norm": 0.35660079439327175,
+      "learning_rate": 0.00012063719594376901,
+      "loss": 0.6337,
+      "step": 2536
+    },
+    {
+      "epoch": 0.4510222222222222,
+      "grad_norm": 0.3527386080500564,
+      "learning_rate": 0.00012058085151548668,
+      "loss": 0.6444,
+      "step": 2537
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3495730898005332,
+      "learning_rate": 0.00012052450026359197,
+      "loss": 0.6088,
+      "step": 2538
+    },
+    {
+      "epoch": 0.45137777777777777,
+      "grad_norm": 0.4030111494261051,
+      "learning_rate": 0.00012046814220676817,
+      "loss": 0.5907,
+      "step": 2539
+    },
+    {
+      "epoch": 0.45155555555555554,
+      "grad_norm": 0.3431991453405851,
+      "learning_rate": 0.00012041177736370093,
+      "loss": 0.6022,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.36766567743241135,
+      "learning_rate": 0.00012035540575307809,
+      "loss": 0.6592,
+      "step": 2541
+    },
+    {
+      "epoch": 0.4519111111111111,
+      "grad_norm": 0.38535132266059796,
+      "learning_rate": 0.00012029902739358971,
+      "loss": 0.6736,
+      "step": 2542
+    },
+    {
+      "epoch": 0.4520888888888889,
+      "grad_norm": 0.3836992370242985,
+      "learning_rate": 0.00012024264230392819,
+      "loss": 0.6622,
+      "step": 2543
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.3354627586731223,
+      "learning_rate": 0.00012018625050278802,
+      "loss": 0.5926,
+      "step": 2544
+    },
+    {
+      "epoch": 0.4524444444444444,
+      "grad_norm": 0.3571685686510973,
+      "learning_rate": 0.00012012985200886602,
+      "loss": 0.6209,
+      "step": 2545
+    },
+    {
+      "epoch": 0.4526222222222222,
+      "grad_norm": 0.3541162745120483,
+      "learning_rate": 0.00012007344684086119,
+      "loss": 0.6508,
+      "step": 2546
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3831361097772331,
+      "learning_rate": 0.00012001703501747475,
+      "loss": 0.6179,
+      "step": 2547
+    },
+    {
+      "epoch": 0.45297777777777776,
+      "grad_norm": 0.36551304036628957,
+      "learning_rate": 0.00011996061655741013,
+      "loss": 0.639,
+      "step": 2548
+    },
+    {
+      "epoch": 0.45315555555555553,
+      "grad_norm": 0.37321440724637994,
+      "learning_rate": 0.00011990419147937295,
+      "loss": 0.6216,
+      "step": 2549
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.36736687257775785,
+      "learning_rate": 0.00011984775980207105,
+      "loss": 0.6485,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4535111111111111,
+      "grad_norm": 0.34232912567235985,
+      "learning_rate": 0.0001197913215442144,
+      "loss": 0.5696,
+      "step": 2551
+    },
+    {
+      "epoch": 0.45368888888888886,
+      "grad_norm": 0.39308572174384604,
+      "learning_rate": 0.00011973487672451523,
+      "loss": 0.6747,
+      "step": 2552
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.3438333290686015,
+      "learning_rate": 0.00011967842536168785,
+      "loss": 0.6,
+      "step": 2553
+    },
+    {
+      "epoch": 0.4540444444444444,
+      "grad_norm": 0.3506880602989641,
+      "learning_rate": 0.00011962196747444882,
+      "loss": 0.6287,
+      "step": 2554
+    },
+    {
+      "epoch": 0.45422222222222225,
+      "grad_norm": 0.3627018536646449,
+      "learning_rate": 0.00011956550308151689,
+      "loss": 0.607,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.35503347776951466,
+      "learning_rate": 0.00011950903220161285,
+      "loss": 0.5889,
+      "step": 2556
+    },
+    {
+      "epoch": 0.4545777777777778,
+      "grad_norm": 0.33814420598465106,
+      "learning_rate": 0.00011945255485345972,
+      "loss": 0.6156,
+      "step": 2557
+    },
+    {
+      "epoch": 0.4547555555555556,
+      "grad_norm": 0.3662821736322486,
+      "learning_rate": 0.00011939607105578266,
+      "loss": 0.6193,
+      "step": 2558
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.35731199879165426,
+      "learning_rate": 0.00011933958082730894,
+      "loss": 0.6694,
+      "step": 2559
+    },
+    {
+      "epoch": 0.45511111111111113,
+      "grad_norm": 0.3325573301812507,
+      "learning_rate": 0.000119283084186768,
+      "loss": 0.5469,
+      "step": 2560
+    },
+    {
+      "epoch": 0.4552888888888889,
+      "grad_norm": 0.5457887907196308,
+      "learning_rate": 0.00011922658115289141,
+      "loss": 0.6855,
+      "step": 2561
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.36198247507661996,
+      "learning_rate": 0.00011917007174441279,
+      "loss": 0.6257,
+      "step": 2562
+    },
+    {
+      "epoch": 0.45564444444444446,
+      "grad_norm": 0.35081646164599617,
+      "learning_rate": 0.00011911355598006794,
+      "loss": 0.5782,
+      "step": 2563
+    },
+    {
+      "epoch": 0.45582222222222224,
+      "grad_norm": 0.35967694469167877,
+      "learning_rate": 0.00011905703387859475,
+      "loss": 0.6249,
+      "step": 2564
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3486612584742585,
+      "learning_rate": 0.0001190005054587332,
+      "loss": 0.6371,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4561777777777778,
+      "grad_norm": 0.3713135940760266,
+      "learning_rate": 0.00011894397073922536,
+      "loss": 0.6545,
+      "step": 2566
+    },
+    {
+      "epoch": 0.45635555555555557,
+      "grad_norm": 0.36423054497226026,
+      "learning_rate": 0.00011888742973881543,
+      "loss": 0.6611,
+      "step": 2567
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.3625864988060894,
+      "learning_rate": 0.00011883088247624965,
+      "loss": 0.6082,
+      "step": 2568
+    },
+    {
+      "epoch": 0.4567111111111111,
+      "grad_norm": 0.33217904307380747,
+      "learning_rate": 0.00011877432897027637,
+      "loss": 0.6047,
+      "step": 2569
+    },
+    {
+      "epoch": 0.4568888888888889,
+      "grad_norm": 0.338344936754837,
+      "learning_rate": 0.00011871776923964592,
+      "loss": 0.5985,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.3609757180115861,
+      "learning_rate": 0.00011866120330311086,
+      "loss": 0.6193,
+      "step": 2571
+    },
+    {
+      "epoch": 0.45724444444444445,
+      "grad_norm": 0.3512902947713863,
+      "learning_rate": 0.00011860463117942567,
+      "loss": 0.6116,
+      "step": 2572
+    },
+    {
+      "epoch": 0.45742222222222223,
+      "grad_norm": 0.3823292504112605,
+      "learning_rate": 0.00011854805288734689,
+      "loss": 0.6817,
+      "step": 2573
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.34817556542078,
+      "learning_rate": 0.0001184914684456332,
+      "loss": 0.5696,
+      "step": 2574
+    },
+    {
+      "epoch": 0.4577777777777778,
+      "grad_norm": 0.35394866568776095,
+      "learning_rate": 0.00011843487787304521,
+      "loss": 0.5984,
+      "step": 2575
+    },
+    {
+      "epoch": 0.45795555555555556,
+      "grad_norm": 0.3451786970155022,
+      "learning_rate": 0.00011837828118834564,
+      "loss": 0.6008,
+      "step": 2576
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.3639374317634931,
+      "learning_rate": 0.00011832167841029918,
+      "loss": 0.612,
+      "step": 2577
+    },
+    {
+      "epoch": 0.4583111111111111,
+      "grad_norm": 0.34210717123781936,
+      "learning_rate": 0.00011826506955767258,
+      "loss": 0.5995,
+      "step": 2578
+    },
+    {
+      "epoch": 0.4584888888888889,
+      "grad_norm": 0.35947946747542225,
+      "learning_rate": 0.00011820845464923458,
+      "loss": 0.6229,
+      "step": 2579
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.378625822610655,
+      "learning_rate": 0.00011815183370375595,
+      "loss": 0.6524,
+      "step": 2580
+    },
+    {
+      "epoch": 0.45884444444444444,
+      "grad_norm": 0.34824514391996814,
+      "learning_rate": 0.00011809520674000944,
+      "loss": 0.6016,
+      "step": 2581
+    },
+    {
+      "epoch": 0.4590222222222222,
+      "grad_norm": 0.3578488579196418,
+      "learning_rate": 0.00011803857377676983,
+      "loss": 0.6026,
+      "step": 2582
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.36466370898040745,
+      "learning_rate": 0.00011798193483281386,
+      "loss": 0.5943,
+      "step": 2583
+    },
+    {
+      "epoch": 0.4593777777777778,
+      "grad_norm": 0.3764979473198388,
+      "learning_rate": 0.00011792528992692022,
+      "loss": 0.6732,
+      "step": 2584
+    },
+    {
+      "epoch": 0.45955555555555555,
+      "grad_norm": 0.3759736729902727,
+      "learning_rate": 0.00011786863907786965,
+      "loss": 0.5957,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.36624487114376464,
+      "learning_rate": 0.00011781198230444479,
+      "loss": 0.6184,
+      "step": 2586
+    },
+    {
+      "epoch": 0.4599111111111111,
+      "grad_norm": 0.33735658839872296,
+      "learning_rate": 0.00011775531962543036,
+      "loss": 0.6043,
+      "step": 2587
+    },
+    {
+      "epoch": 0.4600888888888889,
+      "grad_norm": 0.3511801928186866,
+      "learning_rate": 0.00011769865105961283,
+      "loss": 0.6099,
+      "step": 2588
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.34014649416870824,
+      "learning_rate": 0.00011764197662578086,
+      "loss": 0.5811,
+      "step": 2589
+    },
+    {
+      "epoch": 0.46044444444444443,
+      "grad_norm": 0.3591190467678517,
+      "learning_rate": 0.0001175852963427249,
+      "loss": 0.6081,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4606222222222222,
+      "grad_norm": 0.35996571845822695,
+      "learning_rate": 0.00011752861022923736,
+      "loss": 0.6306,
+      "step": 2591
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3613898953089732,
+      "learning_rate": 0.00011747191830411264,
+      "loss": 0.6178,
+      "step": 2592
+    },
+    {
+      "epoch": 0.46097777777777776,
+      "grad_norm": 0.3415070577647176,
+      "learning_rate": 0.00011741522058614705,
+      "loss": 0.639,
+      "step": 2593
+    },
+    {
+      "epoch": 0.46115555555555554,
+      "grad_norm": 0.3656135584756059,
+      "learning_rate": 0.00011735851709413874,
+      "loss": 0.625,
+      "step": 2594
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.3499049734452857,
+      "learning_rate": 0.00011730180784688789,
+      "loss": 0.6405,
+      "step": 2595
+    },
+    {
+      "epoch": 0.4615111111111111,
+      "grad_norm": 0.36387902605449524,
+      "learning_rate": 0.00011724509286319654,
+      "loss": 0.6413,
+      "step": 2596
+    },
+    {
+      "epoch": 0.46168888888888887,
+      "grad_norm": 0.3432788560828658,
+      "learning_rate": 0.0001171883721618686,
+      "loss": 0.5904,
+      "step": 2597
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.3846126542469513,
+      "learning_rate": 0.00011713164576170992,
+      "loss": 0.6395,
+      "step": 2598
+    },
+    {
+      "epoch": 0.4620444444444444,
+      "grad_norm": 0.37512029931777036,
+      "learning_rate": 0.00011707491368152823,
+      "loss": 0.6526,
+      "step": 2599
+    },
+    {
+      "epoch": 0.4622222222222222,
+      "grad_norm": 0.3652760863439255,
+      "learning_rate": 0.00011701817594013312,
+      "loss": 0.6427,
+      "step": 2600
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.34689977607904615,
+      "learning_rate": 0.00011696143255633607,
+      "loss": 0.6347,
+      "step": 2601
+    },
+    {
+      "epoch": 0.46257777777777775,
+      "grad_norm": 0.36668242103221116,
+      "learning_rate": 0.00011690468354895045,
+      "loss": 0.6533,
+      "step": 2602
+    },
+    {
+      "epoch": 0.46275555555555553,
+      "grad_norm": 0.3718117994604249,
+      "learning_rate": 0.00011684792893679149,
+      "loss": 0.6283,
+      "step": 2603
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.3291103069896437,
+      "learning_rate": 0.00011679116873867624,
+      "loss": 0.5934,
+      "step": 2604
+    },
+    {
+      "epoch": 0.4631111111111111,
+      "grad_norm": 0.3347516924602731,
+      "learning_rate": 0.00011673440297342364,
+      "loss": 0.5844,
+      "step": 2605
+    },
+    {
+      "epoch": 0.46328888888888886,
+      "grad_norm": 0.3507778909885527,
+      "learning_rate": 0.00011667763165985446,
+      "loss": 0.5787,
+      "step": 2606
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.340848584314414,
+      "learning_rate": 0.00011662085481679133,
+      "loss": 0.5553,
+      "step": 2607
+    },
+    {
+      "epoch": 0.46364444444444447,
+      "grad_norm": 0.34700831839306545,
+      "learning_rate": 0.00011656407246305867,
+      "loss": 0.6479,
+      "step": 2608
+    },
+    {
+      "epoch": 0.46382222222222225,
+      "grad_norm": 0.35928217108251026,
+      "learning_rate": 0.0001165072846174828,
+      "loss": 0.6003,
+      "step": 2609
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3512026978141891,
+      "learning_rate": 0.00011645049129889179,
+      "loss": 0.6227,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4641777777777778,
+      "grad_norm": 0.34592416581651336,
+      "learning_rate": 0.00011639369252611552,
+      "loss": 0.6416,
+      "step": 2611
+    },
+    {
+      "epoch": 0.4643555555555556,
+      "grad_norm": 0.3532149210589753,
+      "learning_rate": 0.0001163368883179858,
+      "loss": 0.6372,
+      "step": 2612
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.3977834229037991,
+      "learning_rate": 0.00011628007869333603,
+      "loss": 0.6362,
+      "step": 2613
+    },
+    {
+      "epoch": 0.46471111111111113,
+      "grad_norm": 0.3361528441180401,
+      "learning_rate": 0.0001162232636710016,
+      "loss": 0.5933,
+      "step": 2614
+    },
+    {
+      "epoch": 0.4648888888888889,
+      "grad_norm": 0.346657457987765,
+      "learning_rate": 0.00011616644326981963,
+      "loss": 0.6319,
+      "step": 2615
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.3704249807859668,
+      "learning_rate": 0.00011610961750862897,
+      "loss": 0.6496,
+      "step": 2616
+    },
+    {
+      "epoch": 0.46524444444444446,
+      "grad_norm": 0.36322592742827775,
+      "learning_rate": 0.00011605278640627028,
+      "loss": 0.6162,
+      "step": 2617
+    },
+    {
+      "epoch": 0.46542222222222224,
+      "grad_norm": 0.3412190447748015,
+      "learning_rate": 0.00011599594998158602,
+      "loss": 0.5791,
+      "step": 2618
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.36615520677021174,
+      "learning_rate": 0.00011593910825342043,
+      "loss": 0.6701,
+      "step": 2619
+    },
+    {
+      "epoch": 0.4657777777777778,
+      "grad_norm": 0.3824893850067352,
+      "learning_rate": 0.0001158822612406194,
+      "loss": 0.6309,
+      "step": 2620
+    },
+    {
+      "epoch": 0.46595555555555557,
+      "grad_norm": 0.3669126233490488,
+      "learning_rate": 0.00011582540896203067,
+      "loss": 0.6184,
+      "step": 2621
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.3686145934825217,
+      "learning_rate": 0.00011576855143650371,
+      "loss": 0.6139,
+      "step": 2622
+    },
+    {
+      "epoch": 0.4663111111111111,
+      "grad_norm": 0.3768932262212343,
+      "learning_rate": 0.00011571168868288973,
+      "loss": 0.626,
+      "step": 2623
+    },
+    {
+      "epoch": 0.4664888888888889,
+      "grad_norm": 0.3827046669952357,
+      "learning_rate": 0.00011565482072004164,
+      "loss": 0.6296,
+      "step": 2624
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.3460460694096838,
+      "learning_rate": 0.0001155979475668141,
+      "loss": 0.6442,
+      "step": 2625
+    },
+    {
+      "epoch": 0.46684444444444445,
+      "grad_norm": 0.38766965000704273,
+      "learning_rate": 0.00011554106924206347,
+      "loss": 0.6475,
+      "step": 2626
+    },
+    {
+      "epoch": 0.4670222222222222,
+      "grad_norm": 0.3358428689481838,
+      "learning_rate": 0.00011548418576464791,
+      "loss": 0.5847,
+      "step": 2627
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.34018866811431725,
+      "learning_rate": 0.00011542729715342713,
+      "loss": 0.6008,
+      "step": 2628
+    },
+    {
+      "epoch": 0.4673777777777778,
+      "grad_norm": 0.3629115012343357,
+      "learning_rate": 0.00011537040342726271,
+      "loss": 0.6295,
+      "step": 2629
+    },
+    {
+      "epoch": 0.46755555555555556,
+      "grad_norm": 0.3583035374195773,
+      "learning_rate": 0.00011531350460501782,
+      "loss": 0.6551,
+      "step": 2630
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.3605839733646907,
+      "learning_rate": 0.00011525660070555735,
+      "loss": 0.6496,
+      "step": 2631
+    },
+    {
+      "epoch": 0.4679111111111111,
+      "grad_norm": 0.34466580861324947,
+      "learning_rate": 0.0001151996917477479,
+      "loss": 0.5804,
+      "step": 2632
+    },
+    {
+      "epoch": 0.4680888888888889,
+      "grad_norm": 0.36279627612429444,
+      "learning_rate": 0.00011514277775045768,
+      "loss": 0.6807,
+      "step": 2633
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.38075181604467584,
+      "learning_rate": 0.00011508585873255663,
+      "loss": 0.6719,
+      "step": 2634
+    },
+    {
+      "epoch": 0.46844444444444444,
+      "grad_norm": 0.34987587696317807,
+      "learning_rate": 0.00011502893471291636,
+      "loss": 0.645,
+      "step": 2635
+    },
+    {
+      "epoch": 0.4686222222222222,
+      "grad_norm": 0.3448820041592509,
+      "learning_rate": 0.00011497200571041009,
+      "loss": 0.6211,
+      "step": 2636
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3727510610949181,
+      "learning_rate": 0.00011491507174391271,
+      "loss": 0.6162,
+      "step": 2637
+    },
+    {
+      "epoch": 0.46897777777777777,
+      "grad_norm": 0.3583867282546688,
+      "learning_rate": 0.00011485813283230079,
+      "loss": 0.5574,
+      "step": 2638
+    },
+    {
+      "epoch": 0.46915555555555555,
+      "grad_norm": 0.3594947186395565,
+      "learning_rate": 0.00011480118899445247,
+      "loss": 0.6343,
+      "step": 2639
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.34247734029417154,
+      "learning_rate": 0.00011474424024924759,
+      "loss": 0.5675,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4695111111111111,
+      "grad_norm": 0.3764511634940411,
+      "learning_rate": 0.0001146872866155676,
+      "loss": 0.6159,
+      "step": 2641
+    },
+    {
+      "epoch": 0.4696888888888889,
+      "grad_norm": 0.3531976768111412,
+      "learning_rate": 0.00011463032811229557,
+      "loss": 0.5696,
+      "step": 2642
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.3586694966033011,
+      "learning_rate": 0.00011457336475831612,
+      "loss": 0.6358,
+      "step": 2643
+    },
+    {
+      "epoch": 0.47004444444444443,
+      "grad_norm": 0.37034145685760583,
+      "learning_rate": 0.00011451639657251563,
+      "loss": 0.6487,
+      "step": 2644
+    },
+    {
+      "epoch": 0.4702222222222222,
+      "grad_norm": 0.3830748606355568,
+      "learning_rate": 0.00011445942357378192,
+      "loss": 0.6481,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.35652442490379327,
+      "learning_rate": 0.00011440244578100447,
+      "loss": 0.6074,
+      "step": 2646
+    },
+    {
+      "epoch": 0.47057777777777776,
+      "grad_norm": 0.3568499151748979,
+      "learning_rate": 0.0001143454632130744,
+      "loss": 0.5981,
+      "step": 2647
+    },
+    {
+      "epoch": 0.47075555555555554,
+      "grad_norm": 0.3768724152737397,
+      "learning_rate": 0.00011428847588888434,
+      "loss": 0.6353,
+      "step": 2648
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.3765848864355913,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.6358,
+      "step": 2649
+    },
+    {
+      "epoch": 0.4711111111111111,
+      "grad_norm": 0.36915676679875964,
+      "learning_rate": 0.00011417448704730275,
+      "loss": 0.6045,
+      "step": 2650
+    },
+    {
+      "epoch": 0.47128888888888887,
+      "grad_norm": 0.36233416081947073,
+      "learning_rate": 0.0001141174855677044,
+      "loss": 0.6112,
+      "step": 2651
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.3642104553696075,
+      "learning_rate": 0.00011406047940743239,
+      "loss": 0.6625,
+      "step": 2652
+    },
+    {
+      "epoch": 0.4716444444444444,
+      "grad_norm": 0.3434966818111389,
+      "learning_rate": 0.0001140034685853872,
+      "loss": 0.6053,
+      "step": 2653
+    },
+    {
+      "epoch": 0.4718222222222222,
+      "grad_norm": 0.37966589821930796,
+      "learning_rate": 0.00011394645312047086,
+      "loss": 0.6537,
+      "step": 2654
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3442634387796601,
+      "learning_rate": 0.00011388943303158693,
+      "loss": 0.6309,
+      "step": 2655
+    },
+    {
+      "epoch": 0.47217777777777775,
+      "grad_norm": 0.35493277038420756,
+      "learning_rate": 0.0001138324083376405,
+      "loss": 0.6601,
+      "step": 2656
+    },
+    {
+      "epoch": 0.47235555555555553,
+      "grad_norm": 0.3541468266169037,
+      "learning_rate": 0.0001137753790575382,
+      "loss": 0.6371,
+      "step": 2657
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.3340969839230348,
+      "learning_rate": 0.00011371834521018818,
+      "loss": 0.5654,
+      "step": 2658
+    },
+    {
+      "epoch": 0.47271111111111114,
+      "grad_norm": 0.35130500748332294,
+      "learning_rate": 0.00011366130681450008,
+      "loss": 0.6247,
+      "step": 2659
+    },
+    {
+      "epoch": 0.4728888888888889,
+      "grad_norm": 0.3291968480511084,
+      "learning_rate": 0.00011360426388938508,
+      "loss": 0.5779,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.3271214931402535,
+      "learning_rate": 0.00011354721645375588,
+      "loss": 0.5881,
+      "step": 2661
+    },
+    {
+      "epoch": 0.47324444444444447,
+      "grad_norm": 0.3740586765449767,
+      "learning_rate": 0.00011349016452652657,
+      "loss": 0.6168,
+      "step": 2662
+    },
+    {
+      "epoch": 0.47342222222222224,
+      "grad_norm": 0.3828461818661629,
+      "learning_rate": 0.00011343310812661286,
+      "loss": 0.7076,
+      "step": 2663
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.35349321585283905,
+      "learning_rate": 0.00011337604727293185,
+      "loss": 0.5952,
+      "step": 2664
+    },
+    {
+      "epoch": 0.4737777777777778,
+      "grad_norm": 0.33341587424963764,
+      "learning_rate": 0.00011331898198440219,
+      "loss": 0.5731,
+      "step": 2665
+    },
+    {
+      "epoch": 0.4739555555555556,
+      "grad_norm": 0.38392100296688336,
+      "learning_rate": 0.00011326191227994391,
+      "loss": 0.6736,
+      "step": 2666
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.3917244196738235,
+      "learning_rate": 0.00011320483817847862,
+      "loss": 0.6303,
+      "step": 2667
+    },
+    {
+      "epoch": 0.47431111111111113,
+      "grad_norm": 0.3532778146515433,
+      "learning_rate": 0.0001131477596989293,
+      "loss": 0.5841,
+      "step": 2668
+    },
+    {
+      "epoch": 0.4744888888888889,
+      "grad_norm": 0.3466873851107952,
+      "learning_rate": 0.00011309067686022037,
+      "loss": 0.6226,
+      "step": 2669
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.3464991699454127,
+      "learning_rate": 0.00011303358968127778,
+      "loss": 0.5929,
+      "step": 2670
+    },
+    {
+      "epoch": 0.47484444444444446,
+      "grad_norm": 0.35501088118365143,
+      "learning_rate": 0.00011297649818102884,
+      "loss": 0.6398,
+      "step": 2671
+    },
+    {
+      "epoch": 0.47502222222222223,
+      "grad_norm": 0.349899933739195,
+      "learning_rate": 0.00011291940237840235,
+      "loss": 0.6163,
+      "step": 2672
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.38855265897666125,
+      "learning_rate": 0.0001128623022923285,
+      "loss": 0.645,
+      "step": 2673
+    },
+    {
+      "epoch": 0.4753777777777778,
+      "grad_norm": 0.36794088568281935,
+      "learning_rate": 0.00011280519794173889,
+      "loss": 0.6368,
+      "step": 2674
+    },
+    {
+      "epoch": 0.47555555555555556,
+      "grad_norm": 0.34980805194622705,
+      "learning_rate": 0.00011274808934556655,
+      "loss": 0.5698,
+      "step": 2675
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.37832759436319335,
+      "learning_rate": 0.00011269097652274596,
+      "loss": 0.6153,
+      "step": 2676
+    },
+    {
+      "epoch": 0.4759111111111111,
+      "grad_norm": 0.3596230483450509,
+      "learning_rate": 0.00011263385949221295,
+      "loss": 0.5895,
+      "step": 2677
+    },
+    {
+      "epoch": 0.4760888888888889,
+      "grad_norm": 0.36089900480039894,
+      "learning_rate": 0.00011257673827290471,
+      "loss": 0.6572,
+      "step": 2678
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.36648935021651785,
+      "learning_rate": 0.00011251961288375994,
+      "loss": 0.6137,
+      "step": 2679
+    },
+    {
+      "epoch": 0.47644444444444445,
+      "grad_norm": 0.32777743291177003,
+      "learning_rate": 0.0001124624833437186,
+      "loss": 0.5711,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4766222222222222,
+      "grad_norm": 0.354466514045681,
+      "learning_rate": 0.0001124053496717221,
+      "loss": 0.6311,
+      "step": 2681
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.36897056553479496,
+      "learning_rate": 0.00011234821188671319,
+      "loss": 0.6035,
+      "step": 2682
+    },
+    {
+      "epoch": 0.4769777777777778,
+      "grad_norm": 0.3851188681869118,
+      "learning_rate": 0.00011229107000763597,
+      "loss": 0.652,
+      "step": 2683
+    },
+    {
+      "epoch": 0.47715555555555556,
+      "grad_norm": 0.36807460938564895,
+      "learning_rate": 0.00011223392405343594,
+      "loss": 0.6049,
+      "step": 2684
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.3778302718415281,
+      "learning_rate": 0.00011217677404305993,
+      "loss": 0.6808,
+      "step": 2685
+    },
+    {
+      "epoch": 0.4775111111111111,
+      "grad_norm": 0.36782968532147403,
+      "learning_rate": 0.00011211961999545609,
+      "loss": 0.5952,
+      "step": 2686
+    },
+    {
+      "epoch": 0.4776888888888889,
+      "grad_norm": 0.3354434904344047,
+      "learning_rate": 0.00011206246192957391,
+      "loss": 0.603,
+      "step": 2687
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.41553175655275,
+      "learning_rate": 0.0001120052998643643,
+      "loss": 0.6028,
+      "step": 2688
+    },
+    {
+      "epoch": 0.47804444444444444,
+      "grad_norm": 0.35055490126767114,
+      "learning_rate": 0.00011194813381877937,
+      "loss": 0.647,
+      "step": 2689
+    },
+    {
+      "epoch": 0.4782222222222222,
+      "grad_norm": 0.41273037053461203,
+      "learning_rate": 0.00011189096381177265,
+      "loss": 0.604,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3400119695433725,
+      "learning_rate": 0.00011183378986229891,
+      "loss": 0.6049,
+      "step": 2691
+    },
+    {
+      "epoch": 0.47857777777777777,
+      "grad_norm": 0.3406063104757223,
+      "learning_rate": 0.00011177661198931426,
+      "loss": 0.6007,
+      "step": 2692
+    },
+    {
+      "epoch": 0.47875555555555555,
+      "grad_norm": 0.33537977158680193,
+      "learning_rate": 0.00011171943021177615,
+      "loss": 0.6297,
+      "step": 2693
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.3720330247363786,
+      "learning_rate": 0.00011166224454864325,
+      "loss": 0.6691,
+      "step": 2694
+    },
+    {
+      "epoch": 0.4791111111111111,
+      "grad_norm": 0.34645248508811,
+      "learning_rate": 0.00011160505501887555,
+      "loss": 0.6285,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4792888888888889,
+      "grad_norm": 0.3525730364851391,
+      "learning_rate": 0.00011154786164143433,
+      "loss": 0.6425,
+      "step": 2696
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.332652464608944,
+      "learning_rate": 0.00011149066443528218,
+      "loss": 0.5205,
+      "step": 2697
+    },
+    {
+      "epoch": 0.47964444444444443,
+      "grad_norm": 0.35771403998277956,
+      "learning_rate": 0.00011143346341938288,
+      "loss": 0.6244,
+      "step": 2698
+    },
+    {
+      "epoch": 0.4798222222222222,
+      "grad_norm": 0.38398350996700514,
+      "learning_rate": 0.00011137625861270151,
+      "loss": 0.5586,
+      "step": 2699
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.38893302054915424,
+      "learning_rate": 0.00011131905003420442,
+      "loss": 0.6298,
+      "step": 2700
+    },
+    {
+      "epoch": 0.48017777777777776,
+      "grad_norm": 0.3391839272542596,
+      "learning_rate": 0.00011126183770285918,
+      "loss": 0.5568,
+      "step": 2701
+    },
+    {
+      "epoch": 0.48035555555555554,
+      "grad_norm": 0.35387103703885525,
+      "learning_rate": 0.00011120462163763468,
+      "loss": 0.6333,
+      "step": 2702
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.34988748314485196,
+      "learning_rate": 0.00011114740185750093,
+      "loss": 0.6163,
+      "step": 2703
+    },
+    {
+      "epoch": 0.4807111111111111,
+      "grad_norm": 0.36559732127875183,
+      "learning_rate": 0.00011109017838142928,
+      "loss": 0.6038,
+      "step": 2704
+    },
+    {
+      "epoch": 0.48088888888888887,
+      "grad_norm": 0.3657990552249299,
+      "learning_rate": 0.00011103295122839221,
+      "loss": 0.6599,
+      "step": 2705
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.3402190427312039,
+      "learning_rate": 0.00011097572041736353,
+      "loss": 0.6221,
+      "step": 2706
+    },
+    {
+      "epoch": 0.4812444444444444,
+      "grad_norm": 0.35260149677985864,
+      "learning_rate": 0.00011091848596731817,
+      "loss": 0.6225,
+      "step": 2707
+    },
+    {
+      "epoch": 0.4814222222222222,
+      "grad_norm": 0.3638809115304859,
+      "learning_rate": 0.00011086124789723232,
+      "loss": 0.6187,
+      "step": 2708
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3489193289546153,
+      "learning_rate": 0.0001108040062260833,
+      "loss": 0.6052,
+      "step": 2709
+    },
+    {
+      "epoch": 0.4817777777777778,
+      "grad_norm": 0.3517204828552794,
+      "learning_rate": 0.00011074676097284973,
+      "loss": 0.596,
+      "step": 2710
+    },
+    {
+      "epoch": 0.4819555555555556,
+      "grad_norm": 0.34309708875835576,
+      "learning_rate": 0.00011068951215651132,
+      "loss": 0.5934,
+      "step": 2711
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.36781363888822954,
+      "learning_rate": 0.00011063225979604899,
+      "loss": 0.6136,
+      "step": 2712
+    },
+    {
+      "epoch": 0.48231111111111113,
+      "grad_norm": 0.34618456072920345,
+      "learning_rate": 0.00011057500391044489,
+      "loss": 0.6213,
+      "step": 2713
+    },
+    {
+      "epoch": 0.4824888888888889,
+      "grad_norm": 0.37735825252498334,
+      "learning_rate": 0.00011051774451868226,
+      "loss": 0.6437,
+      "step": 2714
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.35199866517338113,
+      "learning_rate": 0.00011046048163974558,
+      "loss": 0.6088,
+      "step": 2715
+    },
+    {
+      "epoch": 0.48284444444444446,
+      "grad_norm": 0.34466995360901276,
+      "learning_rate": 0.00011040321529262041,
+      "loss": 0.6049,
+      "step": 2716
+    },
+    {
+      "epoch": 0.48302222222222224,
+      "grad_norm": 0.3924179006343552,
+      "learning_rate": 0.0001103459454962935,
+      "loss": 0.6667,
+      "step": 2717
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3674466116361259,
+      "learning_rate": 0.00011028867226975272,
+      "loss": 0.6302,
+      "step": 2718
+    },
+    {
+      "epoch": 0.4833777777777778,
+      "grad_norm": 0.3540570626167075,
+      "learning_rate": 0.00011023139563198714,
+      "loss": 0.6062,
+      "step": 2719
+    },
+    {
+      "epoch": 0.48355555555555557,
+      "grad_norm": 0.3641240841142744,
+      "learning_rate": 0.00011017411560198686,
+      "loss": 0.5971,
+      "step": 2720
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.4154207130462865,
+      "learning_rate": 0.00011011683219874323,
+      "loss": 0.6533,
+      "step": 2721
+    },
+    {
+      "epoch": 0.4839111111111111,
+      "grad_norm": 0.34079782344676135,
+      "learning_rate": 0.00011005954544124862,
+      "loss": 0.5708,
+      "step": 2722
+    },
+    {
+      "epoch": 0.4840888888888889,
+      "grad_norm": 0.3677213002409934,
+      "learning_rate": 0.00011000225534849649,
+      "loss": 0.6276,
+      "step": 2723
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.3577200586354648,
+      "learning_rate": 0.0001099449619394815,
+      "loss": 0.5544,
+      "step": 2724
+    },
+    {
+      "epoch": 0.48444444444444446,
+      "grad_norm": 0.36319426448018627,
+      "learning_rate": 0.00010988766523319935,
+      "loss": 0.6471,
+      "step": 2725
+    },
+    {
+      "epoch": 0.48462222222222223,
+      "grad_norm": 0.35324684939353757,
+      "learning_rate": 0.00010983036524864689,
+      "loss": 0.6384,
+      "step": 2726
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3609013571048923,
+      "learning_rate": 0.00010977306200482195,
+      "loss": 0.6265,
+      "step": 2727
+    },
+    {
+      "epoch": 0.4849777777777778,
+      "grad_norm": 0.3518419956554968,
+      "learning_rate": 0.00010971575552072357,
+      "loss": 0.5932,
+      "step": 2728
+    },
+    {
+      "epoch": 0.48515555555555556,
+      "grad_norm": 0.3658745675549795,
+      "learning_rate": 0.00010965844581535178,
+      "loss": 0.6221,
+      "step": 2729
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.36422447564502797,
+      "learning_rate": 0.0001096011329077077,
+      "loss": 0.6436,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4855111111111111,
+      "grad_norm": 0.3774567614812268,
+      "learning_rate": 0.00010954381681679352,
+      "loss": 0.6276,
+      "step": 2731
+    },
+    {
+      "epoch": 0.4856888888888889,
+      "grad_norm": 0.37889147254460265,
+      "learning_rate": 0.00010948649756161246,
+      "loss": 0.6099,
+      "step": 2732
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.33818971949149906,
+      "learning_rate": 0.0001094291751611688,
+      "loss": 0.635,
+      "step": 2733
+    },
+    {
+      "epoch": 0.48604444444444445,
+      "grad_norm": 0.3741478913319296,
+      "learning_rate": 0.00010937184963446788,
+      "loss": 0.6706,
+      "step": 2734
+    },
+    {
+      "epoch": 0.4862222222222222,
+      "grad_norm": 0.36351168281219204,
+      "learning_rate": 0.00010931452100051605,
+      "loss": 0.6102,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3356327170524334,
+      "learning_rate": 0.00010925718927832073,
+      "loss": 0.6024,
+      "step": 2736
+    },
+    {
+      "epoch": 0.4865777777777778,
+      "grad_norm": 0.3572987085131686,
+      "learning_rate": 0.00010919985448689031,
+      "loss": 0.603,
+      "step": 2737
+    },
+    {
+      "epoch": 0.48675555555555555,
+      "grad_norm": 0.3638996318644696,
+      "learning_rate": 0.00010914251664523428,
+      "loss": 0.6206,
+      "step": 2738
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.36630154557599215,
+      "learning_rate": 0.00010908517577236302,
+      "loss": 0.6386,
+      "step": 2739
+    },
+    {
+      "epoch": 0.4871111111111111,
+      "grad_norm": 0.3445619840587946,
+      "learning_rate": 0.00010902783188728802,
+      "loss": 0.6043,
+      "step": 2740
+    },
+    {
+      "epoch": 0.4872888888888889,
+      "grad_norm": 0.38833044321790144,
+      "learning_rate": 0.00010897048500902172,
+      "loss": 0.6421,
+      "step": 2741
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.3553842396887497,
+      "learning_rate": 0.0001089131351565776,
+      "loss": 0.6306,
+      "step": 2742
+    },
+    {
+      "epoch": 0.48764444444444444,
+      "grad_norm": 0.3484606526479836,
+      "learning_rate": 0.00010885578234897003,
+      "loss": 0.6018,
+      "step": 2743
+    },
+    {
+      "epoch": 0.4878222222222222,
+      "grad_norm": 0.4045170295439436,
+      "learning_rate": 0.00010879842660521449,
+      "loss": 0.6049,
+      "step": 2744
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.361274390330694,
+      "learning_rate": 0.00010874106794432728,
+      "loss": 0.6552,
+      "step": 2745
+    },
+    {
+      "epoch": 0.48817777777777777,
+      "grad_norm": 0.37408216159252544,
+      "learning_rate": 0.00010868370638532582,
+      "loss": 0.599,
+      "step": 2746
+    },
+    {
+      "epoch": 0.48835555555555554,
+      "grad_norm": 0.37213300758748064,
+      "learning_rate": 0.00010862634194722839,
+      "loss": 0.5792,
+      "step": 2747
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.3560509110229911,
+      "learning_rate": 0.00010856897464905425,
+      "loss": 0.585,
+      "step": 2748
+    },
+    {
+      "epoch": 0.4887111111111111,
+      "grad_norm": 0.35152372742860466,
+      "learning_rate": 0.00010851160450982363,
+      "loss": 0.6065,
+      "step": 2749
+    },
+    {
+      "epoch": 0.4888888888888889,
+      "grad_norm": 0.3529930348619273,
+      "learning_rate": 0.0001084542315485577,
+      "loss": 0.6007,
+      "step": 2750
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.3579137296933873,
+      "learning_rate": 0.00010839685578427852,
+      "loss": 0.6185,
+      "step": 2751
+    },
+    {
+      "epoch": 0.4892444444444444,
+      "grad_norm": 0.3677083102081914,
+      "learning_rate": 0.00010833947723600913,
+      "loss": 0.6388,
+      "step": 2752
+    },
+    {
+      "epoch": 0.4894222222222222,
+      "grad_norm": 0.3497669374143786,
+      "learning_rate": 0.00010828209592277346,
+      "loss": 0.6451,
+      "step": 2753
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.36761661601408646,
+      "learning_rate": 0.00010822471186359639,
+      "loss": 0.6101,
+      "step": 2754
+    },
+    {
+      "epoch": 0.48977777777777776,
+      "grad_norm": 0.34399349729851153,
+      "learning_rate": 0.00010816732507750369,
+      "loss": 0.5736,
+      "step": 2755
+    },
+    {
+      "epoch": 0.48995555555555553,
+      "grad_norm": 0.39444123166385175,
+      "learning_rate": 0.00010810993558352202,
+      "loss": 0.6284,
+      "step": 2756
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.35128568212938327,
+      "learning_rate": 0.00010805254340067899,
+      "loss": 0.6484,
+      "step": 2757
+    },
+    {
+      "epoch": 0.4903111111111111,
+      "grad_norm": 0.36373459421377374,
+      "learning_rate": 0.00010799514854800298,
+      "loss": 0.6074,
+      "step": 2758
+    },
+    {
+      "epoch": 0.49048888888888886,
+      "grad_norm": 0.34565919628276104,
+      "learning_rate": 0.00010793775104452344,
+      "loss": 0.6426,
+      "step": 2759
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.36787727648529744,
+      "learning_rate": 0.00010788035090927053,
+      "loss": 0.6324,
+      "step": 2760
+    },
+    {
+      "epoch": 0.49084444444444447,
+      "grad_norm": 0.36484557219373004,
+      "learning_rate": 0.0001078229481612754,
+      "loss": 0.6387,
+      "step": 2761
+    },
+    {
+      "epoch": 0.49102222222222225,
+      "grad_norm": 0.35297798272843073,
+      "learning_rate": 0.00010776554281956998,
+      "loss": 0.5804,
+      "step": 2762
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.34499691049091197,
+      "learning_rate": 0.00010770813490318712,
+      "loss": 0.5713,
+      "step": 2763
+    },
+    {
+      "epoch": 0.4913777777777778,
+      "grad_norm": 0.34891630894582804,
+      "learning_rate": 0.00010765072443116049,
+      "loss": 0.6115,
+      "step": 2764
+    },
+    {
+      "epoch": 0.4915555555555556,
+      "grad_norm": 0.35138702833764945,
+      "learning_rate": 0.00010759331142252462,
+      "loss": 0.6197,
+      "step": 2765
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.38567865348514535,
+      "learning_rate": 0.0001075358958963149,
+      "loss": 0.6589,
+      "step": 2766
+    },
+    {
+      "epoch": 0.49191111111111113,
+      "grad_norm": 0.3407754970002096,
+      "learning_rate": 0.0001074784778715675,
+      "loss": 0.5985,
+      "step": 2767
+    },
+    {
+      "epoch": 0.4920888888888889,
+      "grad_norm": 0.3517842959008859,
+      "learning_rate": 0.00010742105736731947,
+      "loss": 0.6522,
+      "step": 2768
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.36403048140710736,
+      "learning_rate": 0.00010736363440260869,
+      "loss": 0.5974,
+      "step": 2769
+    },
+    {
+      "epoch": 0.49244444444444446,
+      "grad_norm": 0.36015957701572476,
+      "learning_rate": 0.00010730620899647379,
+      "loss": 0.6059,
+      "step": 2770
+    },
+    {
+      "epoch": 0.49262222222222224,
+      "grad_norm": 0.34803655189632937,
+      "learning_rate": 0.00010724878116795424,
+      "loss": 0.6267,
+      "step": 2771
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3463003479928638,
+      "learning_rate": 0.00010719135093609038,
+      "loss": 0.5481,
+      "step": 2772
+    },
+    {
+      "epoch": 0.4929777777777778,
+      "grad_norm": 0.35025933562783984,
+      "learning_rate": 0.00010713391831992323,
+      "loss": 0.5708,
+      "step": 2773
+    },
+    {
+      "epoch": 0.49315555555555557,
+      "grad_norm": 0.3693630903126469,
+      "learning_rate": 0.00010707648333849472,
+      "loss": 0.6437,
+      "step": 2774
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.35118765870843055,
+      "learning_rate": 0.00010701904601084745,
+      "loss": 0.6016,
+      "step": 2775
+    },
+    {
+      "epoch": 0.4935111111111111,
+      "grad_norm": 0.3372626141760626,
+      "learning_rate": 0.00010696160635602487,
+      "loss": 0.6101,
+      "step": 2776
+    },
+    {
+      "epoch": 0.4936888888888889,
+      "grad_norm": 0.35267664479676036,
+      "learning_rate": 0.00010690416439307122,
+      "loss": 0.5723,
+      "step": 2777
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.3697368831270203,
+      "learning_rate": 0.00010684672014103143,
+      "loss": 0.5874,
+      "step": 2778
+    },
+    {
+      "epoch": 0.49404444444444445,
+      "grad_norm": 0.3644349469458444,
+      "learning_rate": 0.00010678927361895124,
+      "loss": 0.6141,
+      "step": 2779
+    },
+    {
+      "epoch": 0.49422222222222223,
+      "grad_norm": 0.34684473843740704,
+      "learning_rate": 0.00010673182484587711,
+      "loss": 0.6372,
+      "step": 2780
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3350506295496847,
+      "learning_rate": 0.00010667437384085634,
+      "loss": 0.5779,
+      "step": 2781
+    },
+    {
+      "epoch": 0.4945777777777778,
+      "grad_norm": 0.35946078346449256,
+      "learning_rate": 0.00010661692062293682,
+      "loss": 0.6179,
+      "step": 2782
+    },
+    {
+      "epoch": 0.49475555555555556,
+      "grad_norm": 0.3689683408579485,
+      "learning_rate": 0.00010655946521116726,
+      "loss": 0.6505,
+      "step": 2783
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.370988792598542,
+      "learning_rate": 0.0001065020076245971,
+      "loss": 0.6249,
+      "step": 2784
+    },
+    {
+      "epoch": 0.4951111111111111,
+      "grad_norm": 0.36063430235058636,
+      "learning_rate": 0.0001064445478822765,
+      "loss": 0.6068,
+      "step": 2785
+    },
+    {
+      "epoch": 0.4952888888888889,
+      "grad_norm": 0.35720263045801315,
+      "learning_rate": 0.00010638708600325632,
+      "loss": 0.6222,
+      "step": 2786
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.38600134491333904,
+      "learning_rate": 0.00010632962200658815,
+      "loss": 0.6193,
+      "step": 2787
+    },
+    {
+      "epoch": 0.49564444444444444,
+      "grad_norm": 0.3538939390168749,
+      "learning_rate": 0.00010627215591132422,
+      "loss": 0.6089,
+      "step": 2788
+    },
+    {
+      "epoch": 0.4958222222222222,
+      "grad_norm": 0.35610930727412976,
+      "learning_rate": 0.00010621468773651755,
+      "loss": 0.6407,
+      "step": 2789
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3460646906483097,
+      "learning_rate": 0.00010615721750122177,
+      "loss": 0.5823,
+      "step": 2790
+    },
+    {
+      "epoch": 0.4961777777777778,
+      "grad_norm": 0.33362362823218344,
+      "learning_rate": 0.00010609974522449122,
+      "loss": 0.6216,
+      "step": 2791
+    },
+    {
+      "epoch": 0.49635555555555555,
+      "grad_norm": 0.36261281518354926,
+      "learning_rate": 0.00010604227092538095,
+      "loss": 0.6467,
+      "step": 2792
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.34603549890246943,
+      "learning_rate": 0.00010598479462294663,
+      "loss": 0.6229,
+      "step": 2793
+    },
+    {
+      "epoch": 0.4967111111111111,
+      "grad_norm": 0.34599693327445646,
+      "learning_rate": 0.0001059273163362446,
+      "loss": 0.6143,
+      "step": 2794
+    },
+    {
+      "epoch": 0.4968888888888889,
+      "grad_norm": 0.35513485757639185,
+      "learning_rate": 0.0001058698360843319,
+      "loss": 0.6642,
+      "step": 2795
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.3509477718058822,
+      "learning_rate": 0.00010581235388626618,
+      "loss": 0.6357,
+      "step": 2796
+    },
+    {
+      "epoch": 0.49724444444444443,
+      "grad_norm": 0.364396029448311,
+      "learning_rate": 0.00010575486976110575,
+      "loss": 0.598,
+      "step": 2797
+    },
+    {
+      "epoch": 0.4974222222222222,
+      "grad_norm": 0.3768475083816325,
+      "learning_rate": 0.00010569738372790956,
+      "loss": 0.5976,
+      "step": 2798
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3566503351383309,
+      "learning_rate": 0.00010563989580573719,
+      "loss": 0.6372,
+      "step": 2799
+    },
+    {
+      "epoch": 0.49777777777777776,
+      "grad_norm": 0.3452388377561649,
+      "learning_rate": 0.00010558240601364886,
+      "loss": 0.5963,
+      "step": 2800
+    },
+    {
+      "epoch": 0.49795555555555554,
+      "grad_norm": 0.34920378279486075,
+      "learning_rate": 0.00010552491437070537,
+      "loss": 0.6575,
+      "step": 2801
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.35188471236857155,
+      "learning_rate": 0.0001054674208959682,
+      "loss": 0.566,
+      "step": 2802
+    },
+    {
+      "epoch": 0.4983111111111111,
+      "grad_norm": 0.3492653990364668,
+      "learning_rate": 0.00010540992560849936,
+      "loss": 0.6316,
+      "step": 2803
+    },
+    {
+      "epoch": 0.49848888888888887,
+      "grad_norm": 0.36604260248841,
+      "learning_rate": 0.00010535242852736151,
+      "loss": 0.5899,
+      "step": 2804
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.36538487096263866,
+      "learning_rate": 0.00010529492967161794,
+      "loss": 0.6097,
+      "step": 2805
+    },
+    {
+      "epoch": 0.4988444444444444,
+      "grad_norm": 0.34839531813772273,
+      "learning_rate": 0.00010523742906033241,
+      "loss": 0.6215,
+      "step": 2806
+    },
+    {
+      "epoch": 0.4990222222222222,
+      "grad_norm": 0.33840736496823837,
+      "learning_rate": 0.00010517992671256937,
+      "loss": 0.6107,
+      "step": 2807
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3918809735177293,
+      "learning_rate": 0.00010512242264739381,
+      "loss": 0.6124,
+      "step": 2808
+    },
+    {
+      "epoch": 0.49937777777777775,
+      "grad_norm": 0.35773165944495616,
+      "learning_rate": 0.00010506491688387127,
+      "loss": 0.6215,
+      "step": 2809
+    },
+    {
+      "epoch": 0.49955555555555553,
+      "grad_norm": 0.33996554260270845,
+      "learning_rate": 0.0001050074094410679,
+      "loss": 0.6122,
+      "step": 2810
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.3517499100221005,
+      "learning_rate": 0.00010494990033805038,
+      "loss": 0.5956,
+      "step": 2811
+    },
+    {
+      "epoch": 0.4999111111111111,
+      "grad_norm": 0.33218122062584815,
+      "learning_rate": 0.00010489238959388592,
+      "loss": 0.5604,
+      "step": 2812
+    },
+    {
+      "epoch": 0.5000888888888889,
+      "grad_norm": 0.3498560816930892,
+      "learning_rate": 0.00010483487722764231,
+      "loss": 0.5927,
+      "step": 2813
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.3684901328729348,
+      "learning_rate": 0.00010477736325838785,
+      "loss": 0.6467,
+      "step": 2814
+    },
+    {
+      "epoch": 0.5004444444444445,
+      "grad_norm": 0.3667843096158526,
+      "learning_rate": 0.00010471984770519139,
+      "loss": 0.6199,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5006222222222222,
+      "grad_norm": 0.359214710059682,
+      "learning_rate": 0.00010466233058712229,
+      "loss": 0.608,
+      "step": 2816
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3380932320111709,
+      "learning_rate": 0.00010460481192325045,
+      "loss": 0.5796,
+      "step": 2817
+    },
+    {
+      "epoch": 0.5009777777777777,
+      "grad_norm": 0.3707288954879967,
+      "learning_rate": 0.00010454729173264627,
+      "loss": 0.637,
+      "step": 2818
+    },
+    {
+      "epoch": 0.5011555555555556,
+      "grad_norm": 0.3300407135065756,
+      "learning_rate": 0.00010448977003438066,
+      "loss": 0.593,
+      "step": 2819
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.34950465708506095,
+      "learning_rate": 0.000104432246847525,
+      "loss": 0.5904,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5015111111111111,
+      "grad_norm": 0.34126426032560353,
+      "learning_rate": 0.00010437472219115119,
+      "loss": 0.5793,
+      "step": 2821
+    },
+    {
+      "epoch": 0.5016888888888889,
+      "grad_norm": 0.3356185281893816,
+      "learning_rate": 0.00010431719608433163,
+      "loss": 0.5864,
+      "step": 2822
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.4000250462032941,
+      "learning_rate": 0.00010425966854613922,
+      "loss": 0.564,
+      "step": 2823
+    },
+    {
+      "epoch": 0.5020444444444444,
+      "grad_norm": 0.33102759237268814,
+      "learning_rate": 0.00010420213959564726,
+      "loss": 0.612,
+      "step": 2824
+    },
+    {
+      "epoch": 0.5022222222222222,
+      "grad_norm": 0.35705141978540295,
+      "learning_rate": 0.00010414460925192957,
+      "loss": 0.6473,
+      "step": 2825
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3856557515811476,
+      "learning_rate": 0.00010408707753406041,
+      "loss": 0.6133,
+      "step": 2826
+    },
+    {
+      "epoch": 0.5025777777777778,
+      "grad_norm": 0.36849516824903794,
+      "learning_rate": 0.00010402954446111454,
+      "loss": 0.6982,
+      "step": 2827
+    },
+    {
+      "epoch": 0.5027555555555555,
+      "grad_norm": 0.34277234138188417,
+      "learning_rate": 0.00010397201005216712,
+      "loss": 0.5997,
+      "step": 2828
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.3715528396359596,
+      "learning_rate": 0.00010391447432629376,
+      "loss": 0.6771,
+      "step": 2829
+    },
+    {
+      "epoch": 0.5031111111111111,
+      "grad_norm": 0.3484179123443325,
+      "learning_rate": 0.00010385693730257055,
+      "loss": 0.632,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5032888888888889,
+      "grad_norm": 0.33654209290635734,
+      "learning_rate": 0.00010379939900007393,
+      "loss": 0.5752,
+      "step": 2831
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.3660390639476581,
+      "learning_rate": 0.00010374185943788084,
+      "loss": 0.6554,
+      "step": 2832
+    },
+    {
+      "epoch": 0.5036444444444445,
+      "grad_norm": 0.346019943877722,
+      "learning_rate": 0.0001036843186350686,
+      "loss": 0.5795,
+      "step": 2833
+    },
+    {
+      "epoch": 0.5038222222222222,
+      "grad_norm": 0.3566103197804138,
+      "learning_rate": 0.00010362677661071496,
+      "loss": 0.5937,
+      "step": 2834
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.35575117467178,
+      "learning_rate": 0.00010356923338389806,
+      "loss": 0.6202,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5041777777777777,
+      "grad_norm": 0.3441497215196966,
+      "learning_rate": 0.00010351168897369643,
+      "loss": 0.622,
+      "step": 2836
+    },
+    {
+      "epoch": 0.5043555555555556,
+      "grad_norm": 0.38114956937969147,
+      "learning_rate": 0.00010345414339918902,
+      "loss": 0.65,
+      "step": 2837
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.34563702273352565,
+      "learning_rate": 0.00010339659667945516,
+      "loss": 0.6272,
+      "step": 2838
+    },
+    {
+      "epoch": 0.5047111111111111,
+      "grad_norm": 0.3599459817591909,
+      "learning_rate": 0.00010333904883357455,
+      "loss": 0.6447,
+      "step": 2839
+    },
+    {
+      "epoch": 0.5048888888888889,
+      "grad_norm": 0.359921736087741,
+      "learning_rate": 0.00010328149988062724,
+      "loss": 0.6133,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.3586654850163382,
+      "learning_rate": 0.00010322394983969368,
+      "loss": 0.6287,
+      "step": 2841
+    },
+    {
+      "epoch": 0.5052444444444445,
+      "grad_norm": 0.34841974771113626,
+      "learning_rate": 0.00010316639872985472,
+      "loss": 0.6259,
+      "step": 2842
+    },
+    {
+      "epoch": 0.5054222222222222,
+      "grad_norm": 0.36272827256722956,
+      "learning_rate": 0.00010310884657019146,
+      "loss": 0.5888,
+      "step": 2843
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.35419453750164864,
+      "learning_rate": 0.00010305129337978543,
+      "loss": 0.6025,
+      "step": 2844
+    },
+    {
+      "epoch": 0.5057777777777778,
+      "grad_norm": 0.3610755239552308,
+      "learning_rate": 0.00010299373917771846,
+      "loss": 0.6225,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5059555555555556,
+      "grad_norm": 0.35206729828008626,
+      "learning_rate": 0.00010293618398307276,
+      "loss": 0.5734,
+      "step": 2846
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.45376085976884545,
+      "learning_rate": 0.00010287862781493081,
+      "loss": 0.6708,
+      "step": 2847
+    },
+    {
+      "epoch": 0.5063111111111112,
+      "grad_norm": 0.36643314471592,
+      "learning_rate": 0.00010282107069237548,
+      "loss": 0.5756,
+      "step": 2848
+    },
+    {
+      "epoch": 0.5064888888888889,
+      "grad_norm": 0.3357220441033354,
+      "learning_rate": 0.00010276351263448989,
+      "loss": 0.6114,
+      "step": 2849
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.3521334800412782,
+      "learning_rate": 0.00010270595366035751,
+      "loss": 0.619,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5068444444444444,
+      "grad_norm": 0.3434862798781897,
+      "learning_rate": 0.0001026483937890621,
+      "loss": 0.6065,
+      "step": 2851
+    },
+    {
+      "epoch": 0.5070222222222223,
+      "grad_norm": 0.3704597964796915,
+      "learning_rate": 0.00010259083303968775,
+      "loss": 0.6555,
+      "step": 2852
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3631273288179781,
+      "learning_rate": 0.00010253327143131879,
+      "loss": 0.6377,
+      "step": 2853
+    },
+    {
+      "epoch": 0.5073777777777778,
+      "grad_norm": 0.3436159999082076,
+      "learning_rate": 0.00010247570898303986,
+      "loss": 0.5805,
+      "step": 2854
+    },
+    {
+      "epoch": 0.5075555555555555,
+      "grad_norm": 0.36449824920003177,
+      "learning_rate": 0.0001024181457139359,
+      "loss": 0.6289,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.3564461569146345,
+      "learning_rate": 0.00010236058164309205,
+      "loss": 0.5974,
+      "step": 2856
+    },
+    {
+      "epoch": 0.5079111111111111,
+      "grad_norm": 0.3624876331423114,
+      "learning_rate": 0.0001023030167895938,
+      "loss": 0.6455,
+      "step": 2857
+    },
+    {
+      "epoch": 0.5080888888888889,
+      "grad_norm": 0.4065282276465651,
+      "learning_rate": 0.00010224545117252686,
+      "loss": 0.6389,
+      "step": 2858
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.355295615845016,
+      "learning_rate": 0.00010218788481097719,
+      "loss": 0.6086,
+      "step": 2859
+    },
+    {
+      "epoch": 0.5084444444444445,
+      "grad_norm": 0.36008478013804707,
+      "learning_rate": 0.00010213031772403099,
+      "loss": 0.5975,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5086222222222222,
+      "grad_norm": 0.361608065323148,
+      "learning_rate": 0.00010207274993077475,
+      "loss": 0.5939,
+      "step": 2861
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.37737679440277544,
+      "learning_rate": 0.00010201518145029514,
+      "loss": 0.6516,
+      "step": 2862
+    },
+    {
+      "epoch": 0.5089777777777778,
+      "grad_norm": 0.41062655534356596,
+      "learning_rate": 0.00010195761230167906,
+      "loss": 0.6797,
+      "step": 2863
+    },
+    {
+      "epoch": 0.5091555555555556,
+      "grad_norm": 0.3812547547124469,
+      "learning_rate": 0.00010190004250401368,
+      "loss": 0.63,
+      "step": 2864
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.3411389261481799,
+      "learning_rate": 0.00010184247207638636,
+      "loss": 0.6364,
+      "step": 2865
+    },
+    {
+      "epoch": 0.5095111111111111,
+      "grad_norm": 0.3372784137913383,
+      "learning_rate": 0.0001017849010378846,
+      "loss": 0.5785,
+      "step": 2866
+    },
+    {
+      "epoch": 0.5096888888888889,
+      "grad_norm": 0.3535829710930523,
+      "learning_rate": 0.00010172732940759626,
+      "loss": 0.6458,
+      "step": 2867
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.3626486580503156,
+      "learning_rate": 0.0001016697572046092,
+      "loss": 0.6038,
+      "step": 2868
+    },
+    {
+      "epoch": 0.5100444444444444,
+      "grad_norm": 0.3627595777778218,
+      "learning_rate": 0.00010161218444801164,
+      "loss": 0.6395,
+      "step": 2869
+    },
+    {
+      "epoch": 0.5102222222222222,
+      "grad_norm": 0.3539135083911791,
+      "learning_rate": 0.00010155461115689187,
+      "loss": 0.6391,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.34235740406145715,
+      "learning_rate": 0.00010149703735033845,
+      "loss": 0.6297,
+      "step": 2871
+    },
+    {
+      "epoch": 0.5105777777777778,
+      "grad_norm": 0.357642403952184,
+      "learning_rate": 0.00010143946304744001,
+      "loss": 0.6471,
+      "step": 2872
+    },
+    {
+      "epoch": 0.5107555555555555,
+      "grad_norm": 0.35778765198668405,
+      "learning_rate": 0.00010138188826728543,
+      "loss": 0.6478,
+      "step": 2873
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.3695782303343061,
+      "learning_rate": 0.00010132431302896372,
+      "loss": 0.5901,
+      "step": 2874
+    },
+    {
+      "epoch": 0.5111111111111111,
+      "grad_norm": 0.38786672316122284,
+      "learning_rate": 0.00010126673735156402,
+      "loss": 0.5878,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5112888888888889,
+      "grad_norm": 0.34095507834884853,
+      "learning_rate": 0.00010120916125417563,
+      "loss": 0.5731,
+      "step": 2876
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.3271501522369416,
+      "learning_rate": 0.00010115158475588799,
+      "loss": 0.5888,
+      "step": 2877
+    },
+    {
+      "epoch": 0.5116444444444445,
+      "grad_norm": 0.365155829963744,
+      "learning_rate": 0.00010109400787579071,
+      "loss": 0.58,
+      "step": 2878
+    },
+    {
+      "epoch": 0.5118222222222222,
+      "grad_norm": 0.35205320550146113,
+      "learning_rate": 0.00010103643063297348,
+      "loss": 0.6206,
+      "step": 2879
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.34403296207257006,
+      "learning_rate": 0.0001009788530465261,
+      "loss": 0.6104,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5121777777777777,
+      "grad_norm": 0.3485221769171977,
+      "learning_rate": 0.0001009212751355385,
+      "loss": 0.6093,
+      "step": 2881
+    },
+    {
+      "epoch": 0.5123555555555556,
+      "grad_norm": 0.33122769821530423,
+      "learning_rate": 0.00010086369691910073,
+      "loss": 0.5772,
+      "step": 2882
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.3555973359000505,
+      "learning_rate": 0.00010080611841630296,
+      "loss": 0.5852,
+      "step": 2883
+    },
+    {
+      "epoch": 0.5127111111111111,
+      "grad_norm": 0.365659490986573,
+      "learning_rate": 0.0001007485396462354,
+      "loss": 0.5631,
+      "step": 2884
+    },
+    {
+      "epoch": 0.5128888888888888,
+      "grad_norm": 0.35833070480218726,
+      "learning_rate": 0.0001006909606279884,
+      "loss": 0.6433,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.35166222102821754,
+      "learning_rate": 0.00010063338138065234,
+      "loss": 0.6239,
+      "step": 2886
+    },
+    {
+      "epoch": 0.5132444444444444,
+      "grad_norm": 0.3718176392604159,
+      "learning_rate": 0.00010057580192331775,
+      "loss": 0.6576,
+      "step": 2887
+    },
+    {
+      "epoch": 0.5134222222222222,
+      "grad_norm": 0.961036915007125,
+      "learning_rate": 0.00010051822227507515,
+      "loss": 0.643,
+      "step": 2888
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3862831022927344,
+      "learning_rate": 0.00010046064245501518,
+      "loss": 0.6133,
+      "step": 2889
+    },
+    {
+      "epoch": 0.5137777777777778,
+      "grad_norm": 0.34797681350138393,
+      "learning_rate": 0.0001004030624822285,
+      "loss": 0.6079,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5139555555555556,
+      "grad_norm": 0.38759151148497084,
+      "learning_rate": 0.00010034548237580587,
+      "loss": 0.6264,
+      "step": 2891
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.3610477038546272,
+      "learning_rate": 0.00010028790215483803,
+      "loss": 0.6229,
+      "step": 2892
+    },
+    {
+      "epoch": 0.5143111111111112,
+      "grad_norm": 0.3814255721295432,
+      "learning_rate": 0.00010023032183841579,
+      "loss": 0.6003,
+      "step": 2893
+    },
+    {
+      "epoch": 0.5144888888888889,
+      "grad_norm": 0.37499587100800164,
+      "learning_rate": 0.00010017274144562998,
+      "loss": 0.6282,
+      "step": 2894
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.3701775558107347,
+      "learning_rate": 0.0001001151609955715,
+      "loss": 0.5972,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5148444444444444,
+      "grad_norm": 0.3852048164158099,
+      "learning_rate": 0.0001000575805073312,
+      "loss": 0.6401,
+      "step": 2896
+    },
+    {
+      "epoch": 0.5150222222222223,
+      "grad_norm": 0.336630404537959,
+      "learning_rate": 0.0001,
+      "loss": 0.5556,
+      "step": 2897
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3779473174849211,
+      "learning_rate": 9.994241949266879e-05,
+      "loss": 0.6195,
+      "step": 2898
+    },
+    {
+      "epoch": 0.5153777777777778,
+      "grad_norm": 0.35868374198402836,
+      "learning_rate": 9.988483900442854e-05,
+      "loss": 0.6452,
+      "step": 2899
+    },
+    {
+      "epoch": 0.5155555555555555,
+      "grad_norm": 0.4004543199994003,
+      "learning_rate": 9.982725855437002e-05,
+      "loss": 0.6886,
+      "step": 2900
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.35747016513202784,
+      "learning_rate": 9.976967816158423e-05,
+      "loss": 0.6571,
+      "step": 2901
+    },
+    {
+      "epoch": 0.5159111111111111,
+      "grad_norm": 0.36712045139323474,
+      "learning_rate": 9.9712097845162e-05,
+      "loss": 0.6257,
+      "step": 2902
+    },
+    {
+      "epoch": 0.5160888888888889,
+      "grad_norm": 0.3446530348167064,
+      "learning_rate": 9.965451762419415e-05,
+      "loss": 0.6235,
+      "step": 2903
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.37947442831733524,
+      "learning_rate": 9.959693751777149e-05,
+      "loss": 0.6448,
+      "step": 2904
+    },
+    {
+      "epoch": 0.5164444444444445,
+      "grad_norm": 0.35469199732020873,
+      "learning_rate": 9.953935754498484e-05,
+      "loss": 0.6033,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5166222222222222,
+      "grad_norm": 0.3952447820031589,
+      "learning_rate": 9.948177772492484e-05,
+      "loss": 0.6094,
+      "step": 2906
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3609029313848285,
+      "learning_rate": 9.942419807668227e-05,
+      "loss": 0.6478,
+      "step": 2907
+    },
+    {
+      "epoch": 0.5169777777777778,
+      "grad_norm": 0.391285828099458,
+      "learning_rate": 9.936661861934765e-05,
+      "loss": 0.5886,
+      "step": 2908
+    },
+    {
+      "epoch": 0.5171555555555556,
+      "grad_norm": 0.3460556556219249,
+      "learning_rate": 9.930903937201163e-05,
+      "loss": 0.5819,
+      "step": 2909
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.3636055320598646,
+      "learning_rate": 9.925146035376459e-05,
+      "loss": 0.582,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5175111111111111,
+      "grad_norm": 0.360552208979026,
+      "learning_rate": 9.919388158369707e-05,
+      "loss": 0.6045,
+      "step": 2911
+    },
+    {
+      "epoch": 0.5176888888888889,
+      "grad_norm": 0.3737646187553946,
+      "learning_rate": 9.913630308089927e-05,
+      "loss": 0.6196,
+      "step": 2912
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.35756137908178914,
+      "learning_rate": 9.907872486446152e-05,
+      "loss": 0.6161,
+      "step": 2913
+    },
+    {
+      "epoch": 0.5180444444444444,
+      "grad_norm": 0.365391892951095,
+      "learning_rate": 9.902114695347393e-05,
+      "loss": 0.6237,
+      "step": 2914
+    },
+    {
+      "epoch": 0.5182222222222223,
+      "grad_norm": 0.35831353292325624,
+      "learning_rate": 9.896356936702653e-05,
+      "loss": 0.6069,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.35526902761971624,
+      "learning_rate": 9.890599212420927e-05,
+      "loss": 0.6295,
+      "step": 2916
+    },
+    {
+      "epoch": 0.5185777777777778,
+      "grad_norm": 0.3623638101671441,
+      "learning_rate": 9.884841524411202e-05,
+      "loss": 0.6414,
+      "step": 2917
+    },
+    {
+      "epoch": 0.5187555555555555,
+      "grad_norm": 0.35417122316719685,
+      "learning_rate": 9.879083874582438e-05,
+      "loss": 0.5746,
+      "step": 2918
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.38381388572606173,
+      "learning_rate": 9.8733262648436e-05,
+      "loss": 0.6285,
+      "step": 2919
+    },
+    {
+      "epoch": 0.5191111111111111,
+      "grad_norm": 0.3522278048435224,
+      "learning_rate": 9.867568697103629e-05,
+      "loss": 0.6175,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5192888888888889,
+      "grad_norm": 0.4180580565329305,
+      "learning_rate": 9.861811173271459e-05,
+      "loss": 0.6264,
+      "step": 2921
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.37354074873377613,
+      "learning_rate": 9.856053695255999e-05,
+      "loss": 0.6275,
+      "step": 2922
+    },
+    {
+      "epoch": 0.5196444444444445,
+      "grad_norm": 0.3758961247515681,
+      "learning_rate": 9.850296264966159e-05,
+      "loss": 0.6361,
+      "step": 2923
+    },
+    {
+      "epoch": 0.5198222222222222,
+      "grad_norm": 0.34017103126500436,
+      "learning_rate": 9.844538884310813e-05,
+      "loss": 0.6092,
+      "step": 2924
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.35844883460816734,
+      "learning_rate": 9.838781555198839e-05,
+      "loss": 0.6058,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5201777777777777,
+      "grad_norm": 0.47351394790219103,
+      "learning_rate": 9.833024279539081e-05,
+      "loss": 0.6503,
+      "step": 2926
+    },
+    {
+      "epoch": 0.5203555555555556,
+      "grad_norm": 0.35037283404698394,
+      "learning_rate": 9.827267059240377e-05,
+      "loss": 0.6606,
+      "step": 2927
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.3526269390644446,
+      "learning_rate": 9.821509896211539e-05,
+      "loss": 0.6236,
+      "step": 2928
+    },
+    {
+      "epoch": 0.5207111111111111,
+      "grad_norm": 0.35322818435064757,
+      "learning_rate": 9.815752792361368e-05,
+      "loss": 0.5848,
+      "step": 2929
+    },
+    {
+      "epoch": 0.5208888888888888,
+      "grad_norm": 0.3539780920742349,
+      "learning_rate": 9.809995749598632e-05,
+      "loss": 0.5857,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.35744938734845616,
+      "learning_rate": 9.804238769832095e-05,
+      "loss": 0.6431,
+      "step": 2931
+    },
+    {
+      "epoch": 0.5212444444444444,
+      "grad_norm": 0.3607043617509548,
+      "learning_rate": 9.798481854970485e-05,
+      "loss": 0.6193,
+      "step": 2932
+    },
+    {
+      "epoch": 0.5214222222222222,
+      "grad_norm": 0.36731672479324823,
+      "learning_rate": 9.792725006922527e-05,
+      "loss": 0.6384,
+      "step": 2933
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.35196149132551974,
+      "learning_rate": 9.7869682275969e-05,
+      "loss": 0.5745,
+      "step": 2934
+    },
+    {
+      "epoch": 0.5217777777777778,
+      "grad_norm": 0.4007814820710728,
+      "learning_rate": 9.781211518902285e-05,
+      "loss": 0.6539,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5219555555555555,
+      "grad_norm": 0.39414427988381906,
+      "learning_rate": 9.775454882747315e-05,
+      "loss": 0.6302,
+      "step": 2936
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.36894733246837,
+      "learning_rate": 9.769698321040622e-05,
+      "loss": 0.6474,
+      "step": 2937
+    },
+    {
+      "epoch": 0.5223111111111111,
+      "grad_norm": 0.3882822725995771,
+      "learning_rate": 9.763941835690796e-05,
+      "loss": 0.6397,
+      "step": 2938
+    },
+    {
+      "epoch": 0.5224888888888889,
+      "grad_norm": 0.38523492453561436,
+      "learning_rate": 9.758185428606412e-05,
+      "loss": 0.6075,
+      "step": 2939
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.4057877666168052,
+      "learning_rate": 9.752429101696013e-05,
+      "loss": 0.5862,
+      "step": 2940
+    },
+    {
+      "epoch": 0.5228444444444444,
+      "grad_norm": 0.34496439990568606,
+      "learning_rate": 9.746672856868123e-05,
+      "loss": 0.5848,
+      "step": 2941
+    },
+    {
+      "epoch": 0.5230222222222223,
+      "grad_norm": 0.34486492247772954,
+      "learning_rate": 9.740916696031225e-05,
+      "loss": 0.614,
+      "step": 2942
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3235033304298466,
+      "learning_rate": 9.73516062109379e-05,
+      "loss": 0.5876,
+      "step": 2943
+    },
+    {
+      "epoch": 0.5233777777777778,
+      "grad_norm": 0.3575551506137171,
+      "learning_rate": 9.729404633964248e-05,
+      "loss": 0.5979,
+      "step": 2944
+    },
+    {
+      "epoch": 0.5235555555555556,
+      "grad_norm": 0.3599655231583651,
+      "learning_rate": 9.723648736551015e-05,
+      "loss": 0.6476,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.3656483567812726,
+      "learning_rate": 9.717892930762453e-05,
+      "loss": 0.6267,
+      "step": 2946
+    },
+    {
+      "epoch": 0.5239111111111111,
+      "grad_norm": 0.379213754725316,
+      "learning_rate": 9.71213721850692e-05,
+      "loss": 0.5718,
+      "step": 2947
+    },
+    {
+      "epoch": 0.5240888888888889,
+      "grad_norm": 0.3730532621707509,
+      "learning_rate": 9.706381601692725e-05,
+      "loss": 0.6248,
+      "step": 2948
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.3432656977391376,
+      "learning_rate": 9.700626082228156e-05,
+      "loss": 0.5671,
+      "step": 2949
+    },
+    {
+      "epoch": 0.5244444444444445,
+      "grad_norm": 0.38912488739395096,
+      "learning_rate": 9.694870662021459e-05,
+      "loss": 0.6183,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5246222222222222,
+      "grad_norm": 0.35469745042207806,
+      "learning_rate": 9.689115342980856e-05,
+      "loss": 0.5806,
+      "step": 2951
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.35818278335217285,
+      "learning_rate": 9.683360127014529e-05,
+      "loss": 0.638,
+      "step": 2952
+    },
+    {
+      "epoch": 0.5249777777777778,
+      "grad_norm": 0.3501042752256769,
+      "learning_rate": 9.677605016030632e-05,
+      "loss": 0.5815,
+      "step": 2953
+    },
+    {
+      "epoch": 0.5251555555555556,
+      "grad_norm": 0.3931154270394189,
+      "learning_rate": 9.671850011937277e-05,
+      "loss": 0.6054,
+      "step": 2954
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.35099574769864955,
+      "learning_rate": 9.666095116642549e-05,
+      "loss": 0.6183,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5255111111111112,
+      "grad_norm": 0.3771109287968403,
+      "learning_rate": 9.660340332054483e-05,
+      "loss": 0.6231,
+      "step": 2956
+    },
+    {
+      "epoch": 0.5256888888888889,
+      "grad_norm": 0.33478556952210486,
+      "learning_rate": 9.654585660081099e-05,
+      "loss": 0.6358,
+      "step": 2957
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.3610180396758555,
+      "learning_rate": 9.648831102630356e-05,
+      "loss": 0.6042,
+      "step": 2958
+    },
+    {
+      "epoch": 0.5260444444444444,
+      "grad_norm": 0.3818628723386374,
+      "learning_rate": 9.643076661610196e-05,
+      "loss": 0.5794,
+      "step": 2959
+    },
+    {
+      "epoch": 0.5262222222222223,
+      "grad_norm": 0.37561556756123304,
+      "learning_rate": 9.637322338928504e-05,
+      "loss": 0.639,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.3495583032963642,
+      "learning_rate": 9.631568136493142e-05,
+      "loss": 0.6032,
+      "step": 2961
+    },
+    {
+      "epoch": 0.5265777777777778,
+      "grad_norm": 0.35731148939516844,
+      "learning_rate": 9.625814056211918e-05,
+      "loss": 0.596,
+      "step": 2962
+    },
+    {
+      "epoch": 0.5267555555555555,
+      "grad_norm": 0.35354696768364813,
+      "learning_rate": 9.620060099992609e-05,
+      "loss": 0.5984,
+      "step": 2963
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.3601969735002162,
+      "learning_rate": 9.614306269742947e-05,
+      "loss": 0.6014,
+      "step": 2964
+    },
+    {
+      "epoch": 0.5271111111111111,
+      "grad_norm": 0.34677588890476424,
+      "learning_rate": 9.608552567370626e-05,
+      "loss": 0.6196,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5272888888888889,
+      "grad_norm": 0.34555670822742307,
+      "learning_rate": 9.602798994783289e-05,
+      "loss": 0.6309,
+      "step": 2966
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.3430181957042666,
+      "learning_rate": 9.597045553888548e-05,
+      "loss": 0.6128,
+      "step": 2967
+    },
+    {
+      "epoch": 0.5276444444444445,
+      "grad_norm": 0.347476669007943,
+      "learning_rate": 9.591292246593958e-05,
+      "loss": 0.5784,
+      "step": 2968
+    },
+    {
+      "epoch": 0.5278222222222222,
+      "grad_norm": 0.353854717521372,
+      "learning_rate": 9.585539074807047e-05,
+      "loss": 0.5561,
+      "step": 2969
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.35783471157217955,
+      "learning_rate": 9.579786040435275e-05,
+      "loss": 0.6015,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5281777777777777,
+      "grad_norm": 0.3686395304174823,
+      "learning_rate": 9.574033145386079e-05,
+      "loss": 0.6185,
+      "step": 2971
+    },
+    {
+      "epoch": 0.5283555555555556,
+      "grad_norm": 0.3947530478590641,
+      "learning_rate": 9.568280391566835e-05,
+      "loss": 0.6312,
+      "step": 2972
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.3675404481285992,
+      "learning_rate": 9.562527780884884e-05,
+      "loss": 0.6026,
+      "step": 2973
+    },
+    {
+      "epoch": 0.5287111111111111,
+      "grad_norm": 0.3547325153181744,
+      "learning_rate": 9.556775315247501e-05,
+      "loss": 0.6473,
+      "step": 2974
+    },
+    {
+      "epoch": 0.5288888888888889,
+      "grad_norm": 0.3581767017880835,
+      "learning_rate": 9.551022996561937e-05,
+      "loss": 0.5857,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.3288979979584883,
+      "learning_rate": 9.545270826735374e-05,
+      "loss": 0.5593,
+      "step": 2976
+    },
+    {
+      "epoch": 0.5292444444444444,
+      "grad_norm": 0.34967501780154436,
+      "learning_rate": 9.539518807674957e-05,
+      "loss": 0.6057,
+      "step": 2977
+    },
+    {
+      "epoch": 0.5294222222222222,
+      "grad_norm": 0.37073253484684826,
+      "learning_rate": 9.533766941287771e-05,
+      "loss": 0.6048,
+      "step": 2978
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.37955417288277576,
+      "learning_rate": 9.528015229480864e-05,
+      "loss": 0.652,
+      "step": 2979
+    },
+    {
+      "epoch": 0.5297777777777778,
+      "grad_norm": 0.3873359186323274,
+      "learning_rate": 9.522263674161215e-05,
+      "loss": 0.6439,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5299555555555555,
+      "grad_norm": 0.3820254078051218,
+      "learning_rate": 9.516512277235771e-05,
+      "loss": 0.6427,
+      "step": 2981
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.35265162052968785,
+      "learning_rate": 9.510761040611406e-05,
+      "loss": 0.5435,
+      "step": 2982
+    },
+    {
+      "epoch": 0.5303111111111111,
+      "grad_norm": 0.37053788926145836,
+      "learning_rate": 9.505009966194964e-05,
+      "loss": 0.5894,
+      "step": 2983
+    },
+    {
+      "epoch": 0.5304888888888889,
+      "grad_norm": 0.33521735108513867,
+      "learning_rate": 9.499259055893208e-05,
+      "loss": 0.5472,
+      "step": 2984
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.3646828974786125,
+      "learning_rate": 9.493508311612874e-05,
+      "loss": 0.633,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5308444444444445,
+      "grad_norm": 0.37056438526004687,
+      "learning_rate": 9.48775773526062e-05,
+      "loss": 0.603,
+      "step": 2986
+    },
+    {
+      "epoch": 0.5310222222222222,
+      "grad_norm": 0.37787245926577895,
+      "learning_rate": 9.482007328743065e-05,
+      "loss": 0.6098,
+      "step": 2987
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36488837379231537,
+      "learning_rate": 9.47625709396676e-05,
+      "loss": 0.5894,
+      "step": 2988
+    },
+    {
+      "epoch": 0.5313777777777777,
+      "grad_norm": 0.35361590777252105,
+      "learning_rate": 9.470507032838208e-05,
+      "loss": 0.6264,
+      "step": 2989
+    },
+    {
+      "epoch": 0.5315555555555556,
+      "grad_norm": 0.35393507601515406,
+      "learning_rate": 9.464757147263849e-05,
+      "loss": 0.5819,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.3648103392845752,
+      "learning_rate": 9.459007439150066e-05,
+      "loss": 0.6362,
+      "step": 2991
+    },
+    {
+      "epoch": 0.5319111111111111,
+      "grad_norm": 0.43035577017104076,
+      "learning_rate": 9.45325791040318e-05,
+      "loss": 0.5852,
+      "step": 2992
+    },
+    {
+      "epoch": 0.5320888888888888,
+      "grad_norm": 0.35528654638984675,
+      "learning_rate": 9.447508562929465e-05,
+      "loss": 0.5984,
+      "step": 2993
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.363727616042198,
+      "learning_rate": 9.441759398635115e-05,
+      "loss": 0.6244,
+      "step": 2994
+    },
+    {
+      "epoch": 0.5324444444444445,
+      "grad_norm": 0.34812432126185633,
+      "learning_rate": 9.436010419426283e-05,
+      "loss": 0.6041,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5326222222222222,
+      "grad_norm": 0.3844321186212034,
+      "learning_rate": 9.430261627209044e-05,
+      "loss": 0.6444,
+      "step": 2996
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3362309205097066,
+      "learning_rate": 9.424513023889427e-05,
+      "loss": 0.596,
+      "step": 2997
+    },
+    {
+      "epoch": 0.5329777777777778,
+      "grad_norm": 0.36276629116989384,
+      "learning_rate": 9.418764611373382e-05,
+      "loss": 0.6008,
+      "step": 2998
+    },
+    {
+      "epoch": 0.5331555555555556,
+      "grad_norm": 0.3356011124921123,
+      "learning_rate": 9.413016391566813e-05,
+      "loss": 0.6017,
+      "step": 2999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.3428435844038877,
+      "learning_rate": 9.407268366375541e-05,
+      "loss": 0.5736,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5335111111111112,
+      "grad_norm": 0.3745312609198618,
+      "learning_rate": 9.401520537705339e-05,
+      "loss": 0.6146,
+      "step": 3001
+    },
+    {
+      "epoch": 0.5336888888888889,
+      "grad_norm": 0.34099990359193805,
+      "learning_rate": 9.395772907461906e-05,
+      "loss": 0.576,
+      "step": 3002
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.3516542656742074,
+      "learning_rate": 9.39002547755088e-05,
+      "loss": 0.6039,
+      "step": 3003
+    },
+    {
+      "epoch": 0.5340444444444444,
+      "grad_norm": 0.35598025192935656,
+      "learning_rate": 9.384278249877823e-05,
+      "loss": 0.6539,
+      "step": 3004
+    },
+    {
+      "epoch": 0.5342222222222223,
+      "grad_norm": 0.3539822890750942,
+      "learning_rate": 9.378531226348247e-05,
+      "loss": 0.6625,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3944789672267754,
+      "learning_rate": 9.372784408867577e-05,
+      "loss": 0.6594,
+      "step": 3006
+    },
+    {
+      "epoch": 0.5345777777777778,
+      "grad_norm": 0.3627464888858326,
+      "learning_rate": 9.367037799341187e-05,
+      "loss": 0.6153,
+      "step": 3007
+    },
+    {
+      "epoch": 0.5347555555555555,
+      "grad_norm": 0.3605039588772481,
+      "learning_rate": 9.361291399674367e-05,
+      "loss": 0.6531,
+      "step": 3008
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.3765641551584363,
+      "learning_rate": 9.35554521177235e-05,
+      "loss": 0.6389,
+      "step": 3009
+    },
+    {
+      "epoch": 0.5351111111111111,
+      "grad_norm": 0.3485758876463875,
+      "learning_rate": 9.349799237540288e-05,
+      "loss": 0.6145,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5352888888888889,
+      "grad_norm": 0.6935716907372586,
+      "learning_rate": 9.344053478883277e-05,
+      "loss": 0.6376,
+      "step": 3011
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.3492611561333636,
+      "learning_rate": 9.338307937706321e-05,
+      "loss": 0.6033,
+      "step": 3012
+    },
+    {
+      "epoch": 0.5356444444444445,
+      "grad_norm": 0.34455975255882687,
+      "learning_rate": 9.332562615914368e-05,
+      "loss": 0.6322,
+      "step": 3013
+    },
+    {
+      "epoch": 0.5358222222222222,
+      "grad_norm": 0.3441906225206993,
+      "learning_rate": 9.326817515412287e-05,
+      "loss": 0.6155,
+      "step": 3014
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4294715417975386,
+      "learning_rate": 9.321072638104879e-05,
+      "loss": 0.5586,
+      "step": 3015
+    },
+    {
+      "epoch": 0.5361777777777778,
+      "grad_norm": 0.37452621132649355,
+      "learning_rate": 9.315327985896857e-05,
+      "loss": 0.6203,
+      "step": 3016
+    },
+    {
+      "epoch": 0.5363555555555556,
+      "grad_norm": 0.3631902743294269,
+      "learning_rate": 9.30958356069288e-05,
+      "loss": 0.5975,
+      "step": 3017
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.37337044984881307,
+      "learning_rate": 9.303839364397511e-05,
+      "loss": 0.6122,
+      "step": 3018
+    },
+    {
+      "epoch": 0.5367111111111111,
+      "grad_norm": 0.3399066713893503,
+      "learning_rate": 9.298095398915256e-05,
+      "loss": 0.6118,
+      "step": 3019
+    },
+    {
+      "epoch": 0.5368888888888889,
+      "grad_norm": 0.34556326993052,
+      "learning_rate": 9.292351666150528e-05,
+      "loss": 0.6076,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.3545040718126656,
+      "learning_rate": 9.286608168007678e-05,
+      "loss": 0.6236,
+      "step": 3021
+    },
+    {
+      "epoch": 0.5372444444444444,
+      "grad_norm": 0.33769307856844244,
+      "learning_rate": 9.280864906390963e-05,
+      "loss": 0.5957,
+      "step": 3022
+    },
+    {
+      "epoch": 0.5374222222222222,
+      "grad_norm": 0.3559084304526048,
+      "learning_rate": 9.275121883204577e-05,
+      "loss": 0.6206,
+      "step": 3023
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.34373779963744044,
+      "learning_rate": 9.269379100352624e-05,
+      "loss": 0.6094,
+      "step": 3024
+    },
+    {
+      "epoch": 0.5377777777777778,
+      "grad_norm": 0.3518854392861585,
+      "learning_rate": 9.263636559739132e-05,
+      "loss": 0.5657,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5379555555555555,
+      "grad_norm": 0.343681953536367,
+      "learning_rate": 9.257894263268054e-05,
+      "loss": 0.5821,
+      "step": 3026
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.357766217489286,
+      "learning_rate": 9.252152212843252e-05,
+      "loss": 0.6149,
+      "step": 3027
+    },
+    {
+      "epoch": 0.5383111111111111,
+      "grad_norm": 0.35798739109935196,
+      "learning_rate": 9.24641041036851e-05,
+      "loss": 0.6395,
+      "step": 3028
+    },
+    {
+      "epoch": 0.5384888888888889,
+      "grad_norm": 0.3535057450281119,
+      "learning_rate": 9.24066885774754e-05,
+      "loss": 0.5945,
+      "step": 3029
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.36059981864387025,
+      "learning_rate": 9.23492755688395e-05,
+      "loss": 0.5919,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5388444444444445,
+      "grad_norm": 0.35602087109642055,
+      "learning_rate": 9.22918650968129e-05,
+      "loss": 0.6057,
+      "step": 3031
+    },
+    {
+      "epoch": 0.5390222222222222,
+      "grad_norm": 0.355986237405648,
+      "learning_rate": 9.223445718043001e-05,
+      "loss": 0.6122,
+      "step": 3032
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.33096217158480706,
+      "learning_rate": 9.217705183872462e-05,
+      "loss": 0.5677,
+      "step": 3033
+    },
+    {
+      "epoch": 0.5393777777777777,
+      "grad_norm": 0.3382042520643305,
+      "learning_rate": 9.211964909072945e-05,
+      "loss": 0.6099,
+      "step": 3034
+    },
+    {
+      "epoch": 0.5395555555555556,
+      "grad_norm": 0.3787502846974754,
+      "learning_rate": 9.206224895547658e-05,
+      "loss": 0.6299,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.3586806414897177,
+      "learning_rate": 9.200485145199704e-05,
+      "loss": 0.6523,
+      "step": 3036
+    },
+    {
+      "epoch": 0.5399111111111111,
+      "grad_norm": 0.3781378245497815,
+      "learning_rate": 9.194745659932105e-05,
+      "loss": 0.6195,
+      "step": 3037
+    },
+    {
+      "epoch": 0.5400888888888888,
+      "grad_norm": 0.4171924019844307,
+      "learning_rate": 9.189006441647799e-05,
+      "loss": 0.6305,
+      "step": 3038
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.3824494785777442,
+      "learning_rate": 9.183267492249635e-05,
+      "loss": 0.598,
+      "step": 3039
+    },
+    {
+      "epoch": 0.5404444444444444,
+      "grad_norm": 0.3513291151204152,
+      "learning_rate": 9.177528813640362e-05,
+      "loss": 0.5873,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5406222222222222,
+      "grad_norm": 0.35935904618984305,
+      "learning_rate": 9.171790407722656e-05,
+      "loss": 0.5844,
+      "step": 3041
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3441305387135324,
+      "learning_rate": 9.166052276399088e-05,
+      "loss": 0.6182,
+      "step": 3042
+    },
+    {
+      "epoch": 0.5409777777777778,
+      "grad_norm": 0.38530641545228766,
+      "learning_rate": 9.160314421572152e-05,
+      "loss": 0.5529,
+      "step": 3043
+    },
+    {
+      "epoch": 0.5411555555555555,
+      "grad_norm": 0.36894512800058743,
+      "learning_rate": 9.154576845144231e-05,
+      "loss": 0.6071,
+      "step": 3044
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.3581027666785767,
+      "learning_rate": 9.148839549017639e-05,
+      "loss": 0.6237,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5415111111111112,
+      "grad_norm": 0.3450882268316929,
+      "learning_rate": 9.143102535094575e-05,
+      "loss": 0.6142,
+      "step": 3046
+    },
+    {
+      "epoch": 0.5416888888888889,
+      "grad_norm": 0.4960994345055827,
+      "learning_rate": 9.137365805277164e-05,
+      "loss": 0.568,
+      "step": 3047
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.35924534009725706,
+      "learning_rate": 9.13162936146742e-05,
+      "loss": 0.6158,
+      "step": 3048
+    },
+    {
+      "epoch": 0.5420444444444444,
+      "grad_norm": 0.3667035458947974,
+      "learning_rate": 9.125893205567273e-05,
+      "loss": 0.6148,
+      "step": 3049
+    },
+    {
+      "epoch": 0.5422222222222223,
+      "grad_norm": 0.36158423398438444,
+      "learning_rate": 9.120157339478555e-05,
+      "loss": 0.6246,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.34326281940349385,
+      "learning_rate": 9.114421765102999e-05,
+      "loss": 0.5021,
+      "step": 3051
+    },
+    {
+      "epoch": 0.5425777777777778,
+      "grad_norm": 0.35569674283341973,
+      "learning_rate": 9.108686484342241e-05,
+      "loss": 0.6413,
+      "step": 3052
+    },
+    {
+      "epoch": 0.5427555555555555,
+      "grad_norm": 0.34389443820476223,
+      "learning_rate": 9.102951499097829e-05,
+      "loss": 0.5538,
+      "step": 3053
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.35396625946768323,
+      "learning_rate": 9.097216811271199e-05,
+      "loss": 0.6344,
+      "step": 3054
+    },
+    {
+      "epoch": 0.5431111111111111,
+      "grad_norm": 0.341088389696483,
+      "learning_rate": 9.0914824227637e-05,
+      "loss": 0.6164,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5432888888888889,
+      "grad_norm": 0.36311799866731137,
+      "learning_rate": 9.085748335476573e-05,
+      "loss": 0.5969,
+      "step": 3056
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.3531098612566486,
+      "learning_rate": 9.08001455131097e-05,
+      "loss": 0.5955,
+      "step": 3057
+    },
+    {
+      "epoch": 0.5436444444444445,
+      "grad_norm": 0.3562884086810283,
+      "learning_rate": 9.074281072167928e-05,
+      "loss": 0.609,
+      "step": 3058
+    },
+    {
+      "epoch": 0.5438222222222222,
+      "grad_norm": 0.346151980360336,
+      "learning_rate": 9.068547899948396e-05,
+      "loss": 0.6094,
+      "step": 3059
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3731839449531166,
+      "learning_rate": 9.062815036553213e-05,
+      "loss": 0.6181,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5441777777777778,
+      "grad_norm": 0.34747862061774065,
+      "learning_rate": 9.057082483883122e-05,
+      "loss": 0.6033,
+      "step": 3061
+    },
+    {
+      "epoch": 0.5443555555555556,
+      "grad_norm": 0.3392560270491025,
+      "learning_rate": 9.051350243838756e-05,
+      "loss": 0.5926,
+      "step": 3062
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.34041497572993407,
+      "learning_rate": 9.045618318320651e-05,
+      "loss": 0.5691,
+      "step": 3063
+    },
+    {
+      "epoch": 0.5447111111111111,
+      "grad_norm": 0.34623108779299117,
+      "learning_rate": 9.039886709229229e-05,
+      "loss": 0.6024,
+      "step": 3064
+    },
+    {
+      "epoch": 0.5448888888888889,
+      "grad_norm": 0.37725364633926356,
+      "learning_rate": 9.034155418464823e-05,
+      "loss": 0.6451,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.3545440476571932,
+      "learning_rate": 9.028424447927641e-05,
+      "loss": 0.5789,
+      "step": 3066
+    },
+    {
+      "epoch": 0.5452444444444444,
+      "grad_norm": 0.33553046371123374,
+      "learning_rate": 9.022693799517806e-05,
+      "loss": 0.6083,
+      "step": 3067
+    },
+    {
+      "epoch": 0.5454222222222223,
+      "grad_norm": 0.3649647731076927,
+      "learning_rate": 9.016963475135313e-05,
+      "loss": 0.5908,
+      "step": 3068
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3646861002799353,
+      "learning_rate": 9.011233476680067e-05,
+      "loss": 0.6235,
+      "step": 3069
+    },
+    {
+      "epoch": 0.5457777777777778,
+      "grad_norm": 0.3588797191871229,
+      "learning_rate": 9.005503806051853e-05,
+      "loss": 0.6129,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5459555555555555,
+      "grad_norm": 0.3656544796879238,
+      "learning_rate": 8.999774465150356e-05,
+      "loss": 0.6313,
+      "step": 3071
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.3511686649342916,
+      "learning_rate": 8.994045455875142e-05,
+      "loss": 0.6027,
+      "step": 3072
+    },
+    {
+      "epoch": 0.5463111111111111,
+      "grad_norm": 0.3528330258003095,
+      "learning_rate": 8.98831678012568e-05,
+      "loss": 0.5755,
+      "step": 3073
+    },
+    {
+      "epoch": 0.5464888888888889,
+      "grad_norm": 0.37001058784158514,
+      "learning_rate": 8.982588439801314e-05,
+      "loss": 0.6382,
+      "step": 3074
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.3509138043944784,
+      "learning_rate": 8.976860436801291e-05,
+      "loss": 0.5873,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5468444444444445,
+      "grad_norm": 0.35842961480793795,
+      "learning_rate": 8.97113277302473e-05,
+      "loss": 0.6716,
+      "step": 3076
+    },
+    {
+      "epoch": 0.5470222222222222,
+      "grad_norm": 0.3702768845996892,
+      "learning_rate": 8.965405450370655e-05,
+      "loss": 0.6196,
+      "step": 3077
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3693486007256979,
+      "learning_rate": 8.959678470737962e-05,
+      "loss": 0.6085,
+      "step": 3078
+    },
+    {
+      "epoch": 0.5473777777777777,
+      "grad_norm": 0.3669139475822687,
+      "learning_rate": 8.953951836025446e-05,
+      "loss": 0.6296,
+      "step": 3079
+    },
+    {
+      "epoch": 0.5475555555555556,
+      "grad_norm": 0.36325698313278154,
+      "learning_rate": 8.948225548131775e-05,
+      "loss": 0.6286,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.35813603385671217,
+      "learning_rate": 8.942499608955516e-05,
+      "loss": 0.5871,
+      "step": 3081
+    },
+    {
+      "epoch": 0.5479111111111111,
+      "grad_norm": 0.3641911072277507,
+      "learning_rate": 8.936774020395103e-05,
+      "loss": 0.6287,
+      "step": 3082
+    },
+    {
+      "epoch": 0.5480888888888888,
+      "grad_norm": 0.3555746540746553,
+      "learning_rate": 8.931048784348875e-05,
+      "loss": 0.584,
+      "step": 3083
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.5759905763971599,
+      "learning_rate": 8.925323902715031e-05,
+      "loss": 0.6031,
+      "step": 3084
+    },
+    {
+      "epoch": 0.5484444444444444,
+      "grad_norm": 0.37142069058373095,
+      "learning_rate": 8.919599377391673e-05,
+      "loss": 0.5777,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5486222222222222,
+      "grad_norm": 0.35549108718528766,
+      "learning_rate": 8.913875210276772e-05,
+      "loss": 0.6154,
+      "step": 3086
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.44243168121754856,
+      "learning_rate": 8.908151403268184e-05,
+      "loss": 0.6181,
+      "step": 3087
+    },
+    {
+      "epoch": 0.5489777777777778,
+      "grad_norm": 0.343253774873771,
+      "learning_rate": 8.902427958263648e-05,
+      "loss": 0.5662,
+      "step": 3088
+    },
+    {
+      "epoch": 0.5491555555555555,
+      "grad_norm": 0.39428861280338556,
+      "learning_rate": 8.896704877160782e-05,
+      "loss": 0.6282,
+      "step": 3089
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.3653713214750227,
+      "learning_rate": 8.890982161857076e-05,
+      "loss": 0.5895,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5495111111111111,
+      "grad_norm": 0.3471673275406715,
+      "learning_rate": 8.885259814249911e-05,
+      "loss": 0.6172,
+      "step": 3091
+    },
+    {
+      "epoch": 0.5496888888888889,
+      "grad_norm": 0.3665147045347773,
+      "learning_rate": 8.879537836236536e-05,
+      "loss": 0.6507,
+      "step": 3092
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.3592818666062855,
+      "learning_rate": 8.873816229714084e-05,
+      "loss": 0.624,
+      "step": 3093
+    },
+    {
+      "epoch": 0.5500444444444444,
+      "grad_norm": 0.34200932991818483,
+      "learning_rate": 8.868094996579561e-05,
+      "loss": 0.5988,
+      "step": 3094
+    },
+    {
+      "epoch": 0.5502222222222222,
+      "grad_norm": 0.34951040669998445,
+      "learning_rate": 8.862374138729853e-05,
+      "loss": 0.5816,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3392300153933995,
+      "learning_rate": 8.856653658061713e-05,
+      "loss": 0.6101,
+      "step": 3096
+    },
+    {
+      "epoch": 0.5505777777777778,
+      "grad_norm": 0.3660001741569265,
+      "learning_rate": 8.850933556471785e-05,
+      "loss": 0.5924,
+      "step": 3097
+    },
+    {
+      "epoch": 0.5507555555555556,
+      "grad_norm": 0.3530203304048615,
+      "learning_rate": 8.845213835856568e-05,
+      "loss": 0.6034,
+      "step": 3098
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.349978795488595,
+      "learning_rate": 8.839494498112447e-05,
+      "loss": 0.5489,
+      "step": 3099
+    },
+    {
+      "epoch": 0.5511111111111111,
+      "grad_norm": 0.3571641427643841,
+      "learning_rate": 8.833775545135678e-05,
+      "loss": 0.6479,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5512888888888889,
+      "grad_norm": 0.3642650747991144,
+      "learning_rate": 8.82805697882239e-05,
+      "loss": 0.6121,
+      "step": 3101
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.377636787329736,
+      "learning_rate": 8.822338801068575e-05,
+      "loss": 0.6204,
+      "step": 3102
+    },
+    {
+      "epoch": 0.5516444444444445,
+      "grad_norm": 0.34967875380412294,
+      "learning_rate": 8.816621013770114e-05,
+      "loss": 0.6137,
+      "step": 3103
+    },
+    {
+      "epoch": 0.5518222222222222,
+      "grad_norm": 0.40166395689072415,
+      "learning_rate": 8.810903618822739e-05,
+      "loss": 0.6482,
+      "step": 3104
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3770578571856448,
+      "learning_rate": 8.805186618122068e-05,
+      "loss": 0.6411,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5521777777777778,
+      "grad_norm": 0.3911401868679578,
+      "learning_rate": 8.799470013563573e-05,
+      "loss": 0.6278,
+      "step": 3106
+    },
+    {
+      "epoch": 0.5523555555555556,
+      "grad_norm": 0.3364077173465212,
+      "learning_rate": 8.793753807042613e-05,
+      "loss": 0.5746,
+      "step": 3107
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.3699284574000947,
+      "learning_rate": 8.788038000454395e-05,
+      "loss": 0.6174,
+      "step": 3108
+    },
+    {
+      "epoch": 0.5527111111111112,
+      "grad_norm": 0.41597305088654535,
+      "learning_rate": 8.782322595694012e-05,
+      "loss": 0.6114,
+      "step": 3109
+    },
+    {
+      "epoch": 0.5528888888888889,
+      "grad_norm": 0.37236736217163985,
+      "learning_rate": 8.77660759465641e-05,
+      "loss": 0.6186,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.33558960816760525,
+      "learning_rate": 8.770892999236405e-05,
+      "loss": 0.5467,
+      "step": 3111
+    },
+    {
+      "epoch": 0.5532444444444444,
+      "grad_norm": 0.360362982104993,
+      "learning_rate": 8.765178811328684e-05,
+      "loss": 0.6446,
+      "step": 3112
+    },
+    {
+      "epoch": 0.5534222222222223,
+      "grad_norm": 0.364594648735862,
+      "learning_rate": 8.759465032827794e-05,
+      "loss": 0.5886,
+      "step": 3113
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.36715831679915656,
+      "learning_rate": 8.753751665628141e-05,
+      "loss": 0.5832,
+      "step": 3114
+    },
+    {
+      "epoch": 0.5537777777777778,
+      "grad_norm": 0.35640763908920553,
+      "learning_rate": 8.74803871162401e-05,
+      "loss": 0.6061,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5539555555555555,
+      "grad_norm": 0.35505394075035235,
+      "learning_rate": 8.74232617270953e-05,
+      "loss": 0.6006,
+      "step": 3116
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.35864570486856606,
+      "learning_rate": 8.73661405077871e-05,
+      "loss": 0.6114,
+      "step": 3117
+    },
+    {
+      "epoch": 0.5543111111111111,
+      "grad_norm": 0.3477896098016776,
+      "learning_rate": 8.730902347725406e-05,
+      "loss": 0.6191,
+      "step": 3118
+    },
+    {
+      "epoch": 0.5544888888888889,
+      "grad_norm": 0.34130412328102105,
+      "learning_rate": 8.725191065443348e-05,
+      "loss": 0.5663,
+      "step": 3119
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.3485717646431946,
+      "learning_rate": 8.719480205826113e-05,
+      "loss": 0.5826,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5548444444444445,
+      "grad_norm": 0.3902562888116477,
+      "learning_rate": 8.713769770767155e-05,
+      "loss": 0.6397,
+      "step": 3121
+    },
+    {
+      "epoch": 0.5550222222222222,
+      "grad_norm": 0.36643025161550385,
+      "learning_rate": 8.708059762159768e-05,
+      "loss": 0.5958,
+      "step": 3122
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3640031617331684,
+      "learning_rate": 8.702350181897118e-05,
+      "loss": 0.633,
+      "step": 3123
+    },
+    {
+      "epoch": 0.5553777777777777,
+      "grad_norm": 0.3465618679460059,
+      "learning_rate": 8.696641031872224e-05,
+      "loss": 0.5949,
+      "step": 3124
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 0.3488210519703397,
+      "learning_rate": 8.690932313977967e-05,
+      "loss": 0.5929,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.3515530668214604,
+      "learning_rate": 8.685224030107074e-05,
+      "loss": 0.6046,
+      "step": 3126
+    },
+    {
+      "epoch": 0.5559111111111111,
+      "grad_norm": 0.3666684164577866,
+      "learning_rate": 8.679516182152142e-05,
+      "loss": 0.6072,
+      "step": 3127
+    },
+    {
+      "epoch": 0.5560888888888889,
+      "grad_norm": 0.34904269030972257,
+      "learning_rate": 8.67380877200561e-05,
+      "loss": 0.5815,
+      "step": 3128
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.36260934065996636,
+      "learning_rate": 8.668101801559786e-05,
+      "loss": 0.5884,
+      "step": 3129
+    },
+    {
+      "epoch": 0.5564444444444444,
+      "grad_norm": 0.3628142272416488,
+      "learning_rate": 8.662395272706816e-05,
+      "loss": 0.5717,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5566222222222222,
+      "grad_norm": 0.3459240506705345,
+      "learning_rate": 8.656689187338719e-05,
+      "loss": 0.6121,
+      "step": 3131
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.344497018084573,
+      "learning_rate": 8.650983547347344e-05,
+      "loss": 0.6168,
+      "step": 3132
+    },
+    {
+      "epoch": 0.5569777777777778,
+      "grad_norm": 0.35586702287311356,
+      "learning_rate": 8.645278354624417e-05,
+      "loss": 0.5718,
+      "step": 3133
+    },
+    {
+      "epoch": 0.5571555555555555,
+      "grad_norm": 0.3322156957893119,
+      "learning_rate": 8.639573611061493e-05,
+      "loss": 0.5861,
+      "step": 3134
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.3476549058512365,
+      "learning_rate": 8.633869318549994e-05,
+      "loss": 0.607,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5575111111111111,
+      "grad_norm": 0.37275228473645716,
+      "learning_rate": 8.628165478981184e-05,
+      "loss": 0.5912,
+      "step": 3136
+    },
+    {
+      "epoch": 0.5576888888888889,
+      "grad_norm": 0.36493057153297015,
+      "learning_rate": 8.622462094246184e-05,
+      "loss": 0.6098,
+      "step": 3137
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.37000568478540485,
+      "learning_rate": 8.616759166235953e-05,
+      "loss": 0.6161,
+      "step": 3138
+    },
+    {
+      "epoch": 0.5580444444444445,
+      "grad_norm": 0.35943889795557477,
+      "learning_rate": 8.611056696841312e-05,
+      "loss": 0.565,
+      "step": 3139
+    },
+    {
+      "epoch": 0.5582222222222222,
+      "grad_norm": 0.3481778216699788,
+      "learning_rate": 8.605354687952915e-05,
+      "loss": 0.5788,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.351397478124162,
+      "learning_rate": 8.599653141461283e-05,
+      "loss": 0.6196,
+      "step": 3141
+    },
+    {
+      "epoch": 0.5585777777777777,
+      "grad_norm": 0.4284752981764255,
+      "learning_rate": 8.593952059256762e-05,
+      "loss": 0.6028,
+      "step": 3142
+    },
+    {
+      "epoch": 0.5587555555555556,
+      "grad_norm": 0.34091581138652954,
+      "learning_rate": 8.588251443229563e-05,
+      "loss": 0.6181,
+      "step": 3143
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.33577299657976956,
+      "learning_rate": 8.582551295269726e-05,
+      "loss": 0.5781,
+      "step": 3144
+    },
+    {
+      "epoch": 0.5591111111111111,
+      "grad_norm": 0.35497584121487163,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.6502,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5592888888888888,
+      "grad_norm": 0.33747314274775486,
+      "learning_rate": 8.571152411111568e-05,
+      "loss": 0.6141,
+      "step": 3146
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.344317610914493,
+      "learning_rate": 8.565453678692561e-05,
+      "loss": 0.6049,
+      "step": 3147
+    },
+    {
+      "epoch": 0.5596444444444445,
+      "grad_norm": 0.3653650100436146,
+      "learning_rate": 8.559755421899554e-05,
+      "loss": 0.5983,
+      "step": 3148
+    },
+    {
+      "epoch": 0.5598222222222222,
+      "grad_norm": 0.35753188265371255,
+      "learning_rate": 8.554057642621813e-05,
+      "loss": 0.5594,
+      "step": 3149
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.34542644785606025,
+      "learning_rate": 8.54836034274844e-05,
+      "loss": 0.6067,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5601777777777778,
+      "grad_norm": 0.3546525697413544,
+      "learning_rate": 8.54266352416839e-05,
+      "loss": 0.6302,
+      "step": 3151
+    },
+    {
+      "epoch": 0.5603555555555556,
+      "grad_norm": 0.3801307029221448,
+      "learning_rate": 8.536967188770447e-05,
+      "loss": 0.5578,
+      "step": 3152
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.4333945897184056,
+      "learning_rate": 8.531271338443245e-05,
+      "loss": 0.5995,
+      "step": 3153
+    },
+    {
+      "epoch": 0.5607111111111112,
+      "grad_norm": 0.3794676206788478,
+      "learning_rate": 8.525575975075243e-05,
+      "loss": 0.6231,
+      "step": 3154
+    },
+    {
+      "epoch": 0.5608888888888889,
+      "grad_norm": 0.3445862961150107,
+      "learning_rate": 8.519881100554758e-05,
+      "loss": 0.5897,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.34775161594446613,
+      "learning_rate": 8.514186716769924e-05,
+      "loss": 0.5942,
+      "step": 3156
+    },
+    {
+      "epoch": 0.5612444444444444,
+      "grad_norm": 0.3543524040787787,
+      "learning_rate": 8.508492825608733e-05,
+      "loss": 0.5821,
+      "step": 3157
+    },
+    {
+      "epoch": 0.5614222222222223,
+      "grad_norm": 0.36300900376343004,
+      "learning_rate": 8.502799428958994e-05,
+      "loss": 0.5952,
+      "step": 3158
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3597064410580433,
+      "learning_rate": 8.497106528708368e-05,
+      "loss": 0.6607,
+      "step": 3159
+    },
+    {
+      "epoch": 0.5617777777777778,
+      "grad_norm": 0.3506203636181199,
+      "learning_rate": 8.491414126744339e-05,
+      "loss": 0.5876,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5619555555555555,
+      "grad_norm": 0.3617457160387125,
+      "learning_rate": 8.485722224954237e-05,
+      "loss": 0.6384,
+      "step": 3161
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.38679068330514044,
+      "learning_rate": 8.480030825225213e-05,
+      "loss": 0.6219,
+      "step": 3162
+    },
+    {
+      "epoch": 0.5623111111111111,
+      "grad_norm": 0.34490811120009385,
+      "learning_rate": 8.47433992944427e-05,
+      "loss": 0.5689,
+      "step": 3163
+    },
+    {
+      "epoch": 0.5624888888888889,
+      "grad_norm": 0.36722133181190103,
+      "learning_rate": 8.46864953949822e-05,
+      "loss": 0.6173,
+      "step": 3164
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.38256267314838216,
+      "learning_rate": 8.462959657273733e-05,
+      "loss": 0.5816,
+      "step": 3165
+    },
+    {
+      "epoch": 0.5628444444444445,
+      "grad_norm": 0.3775268343160406,
+      "learning_rate": 8.45727028465729e-05,
+      "loss": 0.5871,
+      "step": 3166
+    },
+    {
+      "epoch": 0.5630222222222222,
+      "grad_norm": 0.36256538279905787,
+      "learning_rate": 8.451581423535216e-05,
+      "loss": 0.5537,
+      "step": 3167
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.35620464035234517,
+      "learning_rate": 8.445893075793654e-05,
+      "loss": 0.5896,
+      "step": 3168
+    },
+    {
+      "epoch": 0.5633777777777778,
+      "grad_norm": 0.3646897876478277,
+      "learning_rate": 8.440205243318595e-05,
+      "loss": 0.5723,
+      "step": 3169
+    },
+    {
+      "epoch": 0.5635555555555556,
+      "grad_norm": 0.3747770265453073,
+      "learning_rate": 8.434517927995837e-05,
+      "loss": 0.578,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.37331941498167825,
+      "learning_rate": 8.42883113171103e-05,
+      "loss": 0.6196,
+      "step": 3171
+    },
+    {
+      "epoch": 0.5639111111111111,
+      "grad_norm": 0.37541078837816255,
+      "learning_rate": 8.423144856349631e-05,
+      "loss": 0.6456,
+      "step": 3172
+    },
+    {
+      "epoch": 0.5640888888888889,
+      "grad_norm": 0.3559919088487258,
+      "learning_rate": 8.417459103796934e-05,
+      "loss": 0.6194,
+      "step": 3173
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.34176307523197985,
+      "learning_rate": 8.411773875938062e-05,
+      "loss": 0.6091,
+      "step": 3174
+    },
+    {
+      "epoch": 0.5644444444444444,
+      "grad_norm": 0.36413620864705437,
+      "learning_rate": 8.406089174657963e-05,
+      "loss": 0.6342,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5646222222222222,
+      "grad_norm": 0.3507592133296721,
+      "learning_rate": 8.400405001841399e-05,
+      "loss": 0.6182,
+      "step": 3176
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.36304011525177293,
+      "learning_rate": 8.394721359372977e-05,
+      "loss": 0.6337,
+      "step": 3177
+    },
+    {
+      "epoch": 0.5649777777777778,
+      "grad_norm": 0.3826975668612223,
+      "learning_rate": 8.389038249137107e-05,
+      "loss": 0.6457,
+      "step": 3178
+    },
+    {
+      "epoch": 0.5651555555555555,
+      "grad_norm": 0.3642988280175765,
+      "learning_rate": 8.383355673018042e-05,
+      "loss": 0.5975,
+      "step": 3179
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.35835731920280894,
+      "learning_rate": 8.37767363289984e-05,
+      "loss": 0.5828,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5655111111111111,
+      "grad_norm": 0.35282122749763534,
+      "learning_rate": 8.371992130666402e-05,
+      "loss": 0.6017,
+      "step": 3181
+    },
+    {
+      "epoch": 0.5656888888888889,
+      "grad_norm": 0.37653502126753874,
+      "learning_rate": 8.366311168201424e-05,
+      "loss": 0.5912,
+      "step": 3182
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.355864785060337,
+      "learning_rate": 8.36063074738845e-05,
+      "loss": 0.5955,
+      "step": 3183
+    },
+    {
+      "epoch": 0.5660444444444445,
+      "grad_norm": 0.44217982011143075,
+      "learning_rate": 8.354950870110825e-05,
+      "loss": 0.5671,
+      "step": 3184
+    },
+    {
+      "epoch": 0.5662222222222222,
+      "grad_norm": 0.3819307115013125,
+      "learning_rate": 8.349271538251723e-05,
+      "loss": 0.6272,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.37564114361939993,
+      "learning_rate": 8.343592753694135e-05,
+      "loss": 0.5794,
+      "step": 3186
+    },
+    {
+      "epoch": 0.5665777777777777,
+      "grad_norm": 0.3547291479327033,
+      "learning_rate": 8.337914518320873e-05,
+      "loss": 0.6149,
+      "step": 3187
+    },
+    {
+      "epoch": 0.5667555555555556,
+      "grad_norm": 0.483125549496956,
+      "learning_rate": 8.332236834014557e-05,
+      "loss": 0.6427,
+      "step": 3188
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.34078864154225064,
+      "learning_rate": 8.326559702657642e-05,
+      "loss": 0.5659,
+      "step": 3189
+    },
+    {
+      "epoch": 0.5671111111111111,
+      "grad_norm": 0.3483396814003731,
+      "learning_rate": 8.320883126132379e-05,
+      "loss": 0.5937,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5672888888888888,
+      "grad_norm": 0.3558735546193817,
+      "learning_rate": 8.315207106320856e-05,
+      "loss": 0.5945,
+      "step": 3191
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.3388676937189465,
+      "learning_rate": 8.309531645104957e-05,
+      "loss": 0.5916,
+      "step": 3192
+    },
+    {
+      "epoch": 0.5676444444444444,
+      "grad_norm": 0.3794106802209468,
+      "learning_rate": 8.303856744366396e-05,
+      "loss": 0.6703,
+      "step": 3193
+    },
+    {
+      "epoch": 0.5678222222222222,
+      "grad_norm": 0.35055071449610203,
+      "learning_rate": 8.298182405986689e-05,
+      "loss": 0.5758,
+      "step": 3194
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3570849064302238,
+      "learning_rate": 8.29250863184718e-05,
+      "loss": 0.6086,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5681777777777778,
+      "grad_norm": 0.3612083756158218,
+      "learning_rate": 8.28683542382901e-05,
+      "loss": 0.5962,
+      "step": 3196
+    },
+    {
+      "epoch": 0.5683555555555555,
+      "grad_norm": 0.3413380974468956,
+      "learning_rate": 8.281162783813142e-05,
+      "loss": 0.6194,
+      "step": 3197
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.3351282573369659,
+      "learning_rate": 8.275490713680349e-05,
+      "loss": 0.5922,
+      "step": 3198
+    },
+    {
+      "epoch": 0.5687111111111111,
+      "grad_norm": 0.32870786739907515,
+      "learning_rate": 8.269819215311215e-05,
+      "loss": 0.5346,
+      "step": 3199
+    },
+    {
+      "epoch": 0.5688888888888889,
+      "grad_norm": 0.42148297449214084,
+      "learning_rate": 8.264148290586128e-05,
+      "loss": 0.5908,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.3597229690057992,
+      "learning_rate": 8.2584779413853e-05,
+      "loss": 0.5803,
+      "step": 3201
+    },
+    {
+      "epoch": 0.5692444444444444,
+      "grad_norm": 0.3529049587456271,
+      "learning_rate": 8.252808169588737e-05,
+      "loss": 0.586,
+      "step": 3202
+    },
+    {
+      "epoch": 0.5694222222222223,
+      "grad_norm": 0.3390960192350595,
+      "learning_rate": 8.247138977076268e-05,
+      "loss": 0.5748,
+      "step": 3203
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.35366625829077525,
+      "learning_rate": 8.241470365727512e-05,
+      "loss": 0.5928,
+      "step": 3204
+    },
+    {
+      "epoch": 0.5697777777777778,
+      "grad_norm": 0.34860352320708177,
+      "learning_rate": 8.235802337421919e-05,
+      "loss": 0.5949,
+      "step": 3205
+    },
+    {
+      "epoch": 0.5699555555555555,
+      "grad_norm": 0.3760047648701897,
+      "learning_rate": 8.230134894038717e-05,
+      "loss": 0.6489,
+      "step": 3206
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.3497018581908783,
+      "learning_rate": 8.224468037456969e-05,
+      "loss": 0.5848,
+      "step": 3207
+    },
+    {
+      "epoch": 0.5703111111111111,
+      "grad_norm": 0.3383106023210154,
+      "learning_rate": 8.218801769555522e-05,
+      "loss": 0.5821,
+      "step": 3208
+    },
+    {
+      "epoch": 0.5704888888888889,
+      "grad_norm": 0.34557003005831516,
+      "learning_rate": 8.213136092213039e-05,
+      "loss": 0.5815,
+      "step": 3209
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.345902953229418,
+      "learning_rate": 8.20747100730798e-05,
+      "loss": 0.6137,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5708444444444445,
+      "grad_norm": 0.36050005168293403,
+      "learning_rate": 8.20180651671862e-05,
+      "loss": 0.6114,
+      "step": 3211
+    },
+    {
+      "epoch": 0.5710222222222222,
+      "grad_norm": 0.3539116096470461,
+      "learning_rate": 8.196142622323018e-05,
+      "loss": 0.595,
+      "step": 3212
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.34764928464885,
+      "learning_rate": 8.190479325999059e-05,
+      "loss": 0.6394,
+      "step": 3213
+    },
+    {
+      "epoch": 0.5713777777777778,
+      "grad_norm": 0.3335957421014551,
+      "learning_rate": 8.184816629624406e-05,
+      "loss": 0.5828,
+      "step": 3214
+    },
+    {
+      "epoch": 0.5715555555555556,
+      "grad_norm": 0.33821529916234605,
+      "learning_rate": 8.179154535076546e-05,
+      "loss": 0.6084,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.3466593247785601,
+      "learning_rate": 8.173493044232745e-05,
+      "loss": 0.58,
+      "step": 3216
+    },
+    {
+      "epoch": 0.5719111111111111,
+      "grad_norm": 0.34376603334207395,
+      "learning_rate": 8.167832158970087e-05,
+      "loss": 0.5852,
+      "step": 3217
+    },
+    {
+      "epoch": 0.5720888888888889,
+      "grad_norm": 0.3410878199409463,
+      "learning_rate": 8.162171881165439e-05,
+      "loss": 0.5353,
+      "step": 3218
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.33581722023083194,
+      "learning_rate": 8.156512212695481e-05,
+      "loss": 0.5958,
+      "step": 3219
+    },
+    {
+      "epoch": 0.5724444444444444,
+      "grad_norm": 0.3876139758830211,
+      "learning_rate": 8.150853155436684e-05,
+      "loss": 0.6065,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5726222222222223,
+      "grad_norm": 0.366702189552425,
+      "learning_rate": 8.145194711265313e-05,
+      "loss": 0.5829,
+      "step": 3221
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3747886781175315,
+      "learning_rate": 8.139536882057437e-05,
+      "loss": 0.6315,
+      "step": 3222
+    },
+    {
+      "epoch": 0.5729777777777778,
+      "grad_norm": 0.35944879676503755,
+      "learning_rate": 8.133879669688919e-05,
+      "loss": 0.5808,
+      "step": 3223
+    },
+    {
+      "epoch": 0.5731555555555555,
+      "grad_norm": 0.3731052388063133,
+      "learning_rate": 8.128223076035409e-05,
+      "loss": 0.6269,
+      "step": 3224
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.37189423198335936,
+      "learning_rate": 8.12256710297237e-05,
+      "loss": 0.5975,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5735111111111111,
+      "grad_norm": 0.3605741705374192,
+      "learning_rate": 8.116911752375037e-05,
+      "loss": 0.6006,
+      "step": 3226
+    },
+    {
+      "epoch": 0.5736888888888889,
+      "grad_norm": 0.3304544761838582,
+      "learning_rate": 8.11125702611846e-05,
+      "loss": 0.551,
+      "step": 3227
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.3558160543577562,
+      "learning_rate": 8.105602926077466e-05,
+      "loss": 0.5521,
+      "step": 3228
+    },
+    {
+      "epoch": 0.5740444444444445,
+      "grad_norm": 0.3253695049622035,
+      "learning_rate": 8.099949454126685e-05,
+      "loss": 0.5248,
+      "step": 3229
+    },
+    {
+      "epoch": 0.5742222222222222,
+      "grad_norm": 0.36688875291056017,
+      "learning_rate": 8.094296612140527e-05,
+      "loss": 0.5809,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3625188073755832,
+      "learning_rate": 8.08864440199321e-05,
+      "loss": 0.6023,
+      "step": 3231
+    },
+    {
+      "epoch": 0.5745777777777777,
+      "grad_norm": 0.3865779612828931,
+      "learning_rate": 8.082992825558725e-05,
+      "loss": 0.6231,
+      "step": 3232
+    },
+    {
+      "epoch": 0.5747555555555556,
+      "grad_norm": 0.3466544763316007,
+      "learning_rate": 8.077341884710862e-05,
+      "loss": 0.6212,
+      "step": 3233
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.31976588144951157,
+      "learning_rate": 8.0716915813232e-05,
+      "loss": 0.556,
+      "step": 3234
+    },
+    {
+      "epoch": 0.5751111111111111,
+      "grad_norm": 0.42158126591798334,
+      "learning_rate": 8.06604191726911e-05,
+      "loss": 0.6117,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5752888888888888,
+      "grad_norm": 0.34953454045791155,
+      "learning_rate": 8.060392894421738e-05,
+      "loss": 0.5842,
+      "step": 3236
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.34427122944642785,
+      "learning_rate": 8.054744514654033e-05,
+      "loss": 0.566,
+      "step": 3237
+    },
+    {
+      "epoch": 0.5756444444444444,
+      "grad_norm": 0.3572357181683326,
+      "learning_rate": 8.049096779838719e-05,
+      "loss": 0.5925,
+      "step": 3238
+    },
+    {
+      "epoch": 0.5758222222222222,
+      "grad_norm": 0.3567602357887107,
+      "learning_rate": 8.043449691848311e-05,
+      "loss": 0.5553,
+      "step": 3239
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3542586374741914,
+      "learning_rate": 8.037803252555119e-05,
+      "loss": 0.5725,
+      "step": 3240
+    },
+    {
+      "epoch": 0.5761777777777778,
+      "grad_norm": 0.3763144038913365,
+      "learning_rate": 8.032157463831216e-05,
+      "loss": 0.6047,
+      "step": 3241
+    },
+    {
+      "epoch": 0.5763555555555555,
+      "grad_norm": 0.34775200908543114,
+      "learning_rate": 8.026512327548481e-05,
+      "loss": 0.5933,
+      "step": 3242
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.3539013838006837,
+      "learning_rate": 8.020867845578561e-05,
+      "loss": 0.5878,
+      "step": 3243
+    },
+    {
+      "epoch": 0.5767111111111111,
+      "grad_norm": 0.3657847387277009,
+      "learning_rate": 8.015224019792897e-05,
+      "loss": 0.5533,
+      "step": 3244
+    },
+    {
+      "epoch": 0.5768888888888889,
+      "grad_norm": 0.3728235269669566,
+      "learning_rate": 8.009580852062705e-05,
+      "loss": 0.5967,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.3336254272730483,
+      "learning_rate": 8.003938344258989e-05,
+      "loss": 0.6004,
+      "step": 3246
+    },
+    {
+      "epoch": 0.5772444444444444,
+      "grad_norm": 0.3391704117672068,
+      "learning_rate": 7.998296498252525e-05,
+      "loss": 0.5806,
+      "step": 3247
+    },
+    {
+      "epoch": 0.5774222222222222,
+      "grad_norm": 0.35932536353209654,
+      "learning_rate": 7.992655315913884e-05,
+      "loss": 0.6297,
+      "step": 3248
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.37869935999021365,
+      "learning_rate": 7.987014799113397e-05,
+      "loss": 0.6298,
+      "step": 3249
+    },
+    {
+      "epoch": 0.5777777777777777,
+      "grad_norm": 0.3528996449807185,
+      "learning_rate": 7.9813749497212e-05,
+      "loss": 0.6214,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5779555555555556,
+      "grad_norm": 0.4077919215628851,
+      "learning_rate": 7.975735769607182e-05,
+      "loss": 0.642,
+      "step": 3251
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.34700963153372466,
+      "learning_rate": 7.97009726064103e-05,
+      "loss": 0.644,
+      "step": 3252
+    },
+    {
+      "epoch": 0.5783111111111111,
+      "grad_norm": 0.3603098568269623,
+      "learning_rate": 7.964459424692192e-05,
+      "loss": 0.6134,
+      "step": 3253
+    },
+    {
+      "epoch": 0.5784888888888889,
+      "grad_norm": 0.3549269215650164,
+      "learning_rate": 7.95882226362991e-05,
+      "loss": 0.6504,
+      "step": 3254
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.3652849680227186,
+      "learning_rate": 7.953185779323184e-05,
+      "loss": 0.6137,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5788444444444445,
+      "grad_norm": 0.3647760847861043,
+      "learning_rate": 7.947549973640805e-05,
+      "loss": 0.5687,
+      "step": 3256
+    },
+    {
+      "epoch": 0.5790222222222222,
+      "grad_norm": 0.42257931506560475,
+      "learning_rate": 7.941914848451332e-05,
+      "loss": 0.5655,
+      "step": 3257
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.36105815023804116,
+      "learning_rate": 7.9362804056231e-05,
+      "loss": 0.5665,
+      "step": 3258
+    },
+    {
+      "epoch": 0.5793777777777778,
+      "grad_norm": 0.393268462341893,
+      "learning_rate": 7.930646647024212e-05,
+      "loss": 0.5744,
+      "step": 3259
+    },
+    {
+      "epoch": 0.5795555555555556,
+      "grad_norm": 0.39559813499428703,
+      "learning_rate": 7.925013574522557e-05,
+      "loss": 0.5856,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.3836796112683418,
+      "learning_rate": 7.919381189985778e-05,
+      "loss": 0.6124,
+      "step": 3261
+    },
+    {
+      "epoch": 0.5799111111111112,
+      "grad_norm": 0.36247571443958754,
+      "learning_rate": 7.913749495281313e-05,
+      "loss": 0.5924,
+      "step": 3262
+    },
+    {
+      "epoch": 0.5800888888888889,
+      "grad_norm": 0.34618340316981183,
+      "learning_rate": 7.90811849227635e-05,
+      "loss": 0.6073,
+      "step": 3263
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.36381507288621134,
+      "learning_rate": 7.902488182837862e-05,
+      "loss": 0.6154,
+      "step": 3264
+    },
+    {
+      "epoch": 0.5804444444444444,
+      "grad_norm": 0.36460098681435715,
+      "learning_rate": 7.896858568832581e-05,
+      "loss": 0.6167,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5806222222222223,
+      "grad_norm": 0.3610372779962909,
+      "learning_rate": 7.891229652127019e-05,
+      "loss": 0.5785,
+      "step": 3266
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.34722416568103387,
+      "learning_rate": 7.885601434587451e-05,
+      "loss": 0.5892,
+      "step": 3267
+    },
+    {
+      "epoch": 0.5809777777777778,
+      "grad_norm": 0.34556477946590836,
+      "learning_rate": 7.879973918079917e-05,
+      "loss": 0.5652,
+      "step": 3268
+    },
+    {
+      "epoch": 0.5811555555555555,
+      "grad_norm": 0.38701087414100327,
+      "learning_rate": 7.874347104470234e-05,
+      "loss": 0.6337,
+      "step": 3269
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.34713506420533025,
+      "learning_rate": 7.868720995623979e-05,
+      "loss": 0.6082,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5815111111111111,
+      "grad_norm": 0.3446063806541194,
+      "learning_rate": 7.863095593406491e-05,
+      "loss": 0.5552,
+      "step": 3271
+    },
+    {
+      "epoch": 0.5816888888888889,
+      "grad_norm": 0.35607543569341626,
+      "learning_rate": 7.857470899682891e-05,
+      "loss": 0.625,
+      "step": 3272
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.3494569640832731,
+      "learning_rate": 7.851846916318046e-05,
+      "loss": 0.5939,
+      "step": 3273
+    },
+    {
+      "epoch": 0.5820444444444445,
+      "grad_norm": 0.3666921985855683,
+      "learning_rate": 7.846223645176601e-05,
+      "loss": 0.5892,
+      "step": 3274
+    },
+    {
+      "epoch": 0.5822222222222222,
+      "grad_norm": 0.3815520200790617,
+      "learning_rate": 7.840601088122956e-05,
+      "loss": 0.6154,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3736537771602796,
+      "learning_rate": 7.834979247021284e-05,
+      "loss": 0.6268,
+      "step": 3276
+    },
+    {
+      "epoch": 0.5825777777777777,
+      "grad_norm": 0.35001740396600817,
+      "learning_rate": 7.829358123735508e-05,
+      "loss": 0.5796,
+      "step": 3277
+    },
+    {
+      "epoch": 0.5827555555555556,
+      "grad_norm": 0.356787922303948,
+      "learning_rate": 7.823737720129329e-05,
+      "loss": 0.6633,
+      "step": 3278
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.35954809184534875,
+      "learning_rate": 7.818118038066192e-05,
+      "loss": 0.616,
+      "step": 3279
+    },
+    {
+      "epoch": 0.5831111111111111,
+      "grad_norm": 0.37199503467315925,
+      "learning_rate": 7.812499079409315e-05,
+      "loss": 0.6127,
+      "step": 3280
+    },
+    {
+      "epoch": 0.5832888888888889,
+      "grad_norm": 0.3594395678178431,
+      "learning_rate": 7.806880846021669e-05,
+      "loss": 0.5995,
+      "step": 3281
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.3308563876376353,
+      "learning_rate": 7.801263339765994e-05,
+      "loss": 0.5668,
+      "step": 3282
+    },
+    {
+      "epoch": 0.5836444444444444,
+      "grad_norm": 0.33622357801190716,
+      "learning_rate": 7.795646562504773e-05,
+      "loss": 0.6409,
+      "step": 3283
+    },
+    {
+      "epoch": 0.5838222222222222,
+      "grad_norm": 0.3682693074229111,
+      "learning_rate": 7.790030516100266e-05,
+      "loss": 0.6359,
+      "step": 3284
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3531784526940973,
+      "learning_rate": 7.784415202414477e-05,
+      "loss": 0.5617,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5841777777777778,
+      "grad_norm": 0.37353671756996487,
+      "learning_rate": 7.778800623309174e-05,
+      "loss": 0.5809,
+      "step": 3286
+    },
+    {
+      "epoch": 0.5843555555555555,
+      "grad_norm": 0.35310441129337927,
+      "learning_rate": 7.773186780645876e-05,
+      "loss": 0.5906,
+      "step": 3287
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.3537066659654453,
+      "learning_rate": 7.767573676285868e-05,
+      "loss": 0.6029,
+      "step": 3288
+    },
+    {
+      "epoch": 0.5847111111111111,
+      "grad_norm": 0.3438077129509418,
+      "learning_rate": 7.761961312090174e-05,
+      "loss": 0.5566,
+      "step": 3289
+    },
+    {
+      "epoch": 0.5848888888888889,
+      "grad_norm": 0.33716452222769244,
+      "learning_rate": 7.756349689919589e-05,
+      "loss": 0.5744,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.3621282096715922,
+      "learning_rate": 7.75073881163465e-05,
+      "loss": 0.6225,
+      "step": 3291
+    },
+    {
+      "epoch": 0.5852444444444445,
+      "grad_norm": 0.36523940835678276,
+      "learning_rate": 7.745128679095656e-05,
+      "loss": 0.5389,
+      "step": 3292
+    },
+    {
+      "epoch": 0.5854222222222222,
+      "grad_norm": 0.33802368799807025,
+      "learning_rate": 7.739519294162652e-05,
+      "loss": 0.566,
+      "step": 3293
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3607540558689247,
+      "learning_rate": 7.733910658695442e-05,
+      "loss": 0.6113,
+      "step": 3294
+    },
+    {
+      "epoch": 0.5857777777777777,
+      "grad_norm": 0.35774280066206815,
+      "learning_rate": 7.72830277455357e-05,
+      "loss": 0.5872,
+      "step": 3295
+    },
+    {
+      "epoch": 0.5859555555555556,
+      "grad_norm": 0.4344251113159798,
+      "learning_rate": 7.722695643596348e-05,
+      "loss": 0.6024,
+      "step": 3296
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.3536585333168312,
+      "learning_rate": 7.717089267682818e-05,
+      "loss": 0.6021,
+      "step": 3297
+    },
+    {
+      "epoch": 0.5863111111111111,
+      "grad_norm": 0.36773621638124965,
+      "learning_rate": 7.711483648671794e-05,
+      "loss": 0.6076,
+      "step": 3298
+    },
+    {
+      "epoch": 0.5864888888888888,
+      "grad_norm": 0.34752823390039894,
+      "learning_rate": 7.705878788421816e-05,
+      "loss": 0.6069,
+      "step": 3299
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.3513660436422426,
+      "learning_rate": 7.700274688791196e-05,
+      "loss": 0.6126,
+      "step": 3300
+    },
+    {
+      "epoch": 0.5868444444444444,
+      "grad_norm": 0.3530919828672039,
+      "learning_rate": 7.694671351637969e-05,
+      "loss": 0.5858,
+      "step": 3301
+    },
+    {
+      "epoch": 0.5870222222222222,
+      "grad_norm": 0.3452743729346822,
+      "learning_rate": 7.689068778819943e-05,
+      "loss": 0.6013,
+      "step": 3302
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.37701578988872997,
+      "learning_rate": 7.68346697219465e-05,
+      "loss": 0.6294,
+      "step": 3303
+    },
+    {
+      "epoch": 0.5873777777777778,
+      "grad_norm": 0.3496541077104658,
+      "learning_rate": 7.677865933619379e-05,
+      "loss": 0.5992,
+      "step": 3304
+    },
+    {
+      "epoch": 0.5875555555555556,
+      "grad_norm": 0.3866205948894247,
+      "learning_rate": 7.672265664951165e-05,
+      "loss": 0.6269,
+      "step": 3305
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.3436657532357334,
+      "learning_rate": 7.666666168046785e-05,
+      "loss": 0.5741,
+      "step": 3306
+    },
+    {
+      "epoch": 0.5879111111111112,
+      "grad_norm": 0.3449342997300359,
+      "learning_rate": 7.661067444762759e-05,
+      "loss": 0.5561,
+      "step": 3307
+    },
+    {
+      "epoch": 0.5880888888888889,
+      "grad_norm": 0.38107807862075255,
+      "learning_rate": 7.655469496955354e-05,
+      "loss": 0.578,
+      "step": 3308
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.33779229148129897,
+      "learning_rate": 7.649872326480577e-05,
+      "loss": 0.5831,
+      "step": 3309
+    },
+    {
+      "epoch": 0.5884444444444444,
+      "grad_norm": 0.36559427761031954,
+      "learning_rate": 7.64427593519418e-05,
+      "loss": 0.6128,
+      "step": 3310
+    },
+    {
+      "epoch": 0.5886222222222223,
+      "grad_norm": 0.3670247497724943,
+      "learning_rate": 7.638680324951649e-05,
+      "loss": 0.6287,
+      "step": 3311
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4552853524246501,
+      "learning_rate": 7.633085497608228e-05,
+      "loss": 0.5928,
+      "step": 3312
+    },
+    {
+      "epoch": 0.5889777777777778,
+      "grad_norm": 0.3513254046770352,
+      "learning_rate": 7.627491455018878e-05,
+      "loss": 0.5708,
+      "step": 3313
+    },
+    {
+      "epoch": 0.5891555555555555,
+      "grad_norm": 0.3678438236773794,
+      "learning_rate": 7.621898199038324e-05,
+      "loss": 0.5605,
+      "step": 3314
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.3619970818284411,
+      "learning_rate": 7.616305731521008e-05,
+      "loss": 0.6163,
+      "step": 3315
+    },
+    {
+      "epoch": 0.5895111111111111,
+      "grad_norm": 0.3565846036072688,
+      "learning_rate": 7.610714054321131e-05,
+      "loss": 0.5915,
+      "step": 3316
+    },
+    {
+      "epoch": 0.5896888888888889,
+      "grad_norm": 0.34663205177532047,
+      "learning_rate": 7.605123169292614e-05,
+      "loss": 0.6138,
+      "step": 3317
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.33375501029289745,
+      "learning_rate": 7.599533078289129e-05,
+      "loss": 0.552,
+      "step": 3318
+    },
+    {
+      "epoch": 0.5900444444444445,
+      "grad_norm": 0.375172816981703,
+      "learning_rate": 7.593943783164073e-05,
+      "loss": 0.6553,
+      "step": 3319
+    },
+    {
+      "epoch": 0.5902222222222222,
+      "grad_norm": 0.3511562235624918,
+      "learning_rate": 7.588355285770591e-05,
+      "loss": 0.565,
+      "step": 3320
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3432792388815817,
+      "learning_rate": 7.582767587961552e-05,
+      "loss": 0.5383,
+      "step": 3321
+    },
+    {
+      "epoch": 0.5905777777777778,
+      "grad_norm": 0.3398380046959315,
+      "learning_rate": 7.577180691589573e-05,
+      "loss": 0.5864,
+      "step": 3322
+    },
+    {
+      "epoch": 0.5907555555555556,
+      "grad_norm": 0.376956460317968,
+      "learning_rate": 7.57159459850699e-05,
+      "loss": 0.6085,
+      "step": 3323
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.397766913031596,
+      "learning_rate": 7.566009310565889e-05,
+      "loss": 0.6315,
+      "step": 3324
+    },
+    {
+      "epoch": 0.5911111111111111,
+      "grad_norm": 0.34255839182088776,
+      "learning_rate": 7.560424829618072e-05,
+      "loss": 0.5868,
+      "step": 3325
+    },
+    {
+      "epoch": 0.5912888888888889,
+      "grad_norm": 0.36139566313021204,
+      "learning_rate": 7.554841157515092e-05,
+      "loss": 0.6043,
+      "step": 3326
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.3440667395113186,
+      "learning_rate": 7.549258296108212e-05,
+      "loss": 0.5527,
+      "step": 3327
+    },
+    {
+      "epoch": 0.5916444444444444,
+      "grad_norm": 0.3519612045216992,
+      "learning_rate": 7.543676247248448e-05,
+      "loss": 0.6418,
+      "step": 3328
+    },
+    {
+      "epoch": 0.5918222222222222,
+      "grad_norm": 0.3529309503919504,
+      "learning_rate": 7.538095012786534e-05,
+      "loss": 0.6018,
+      "step": 3329
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.360342477817817,
+      "learning_rate": 7.532514594572934e-05,
+      "loss": 0.5708,
+      "step": 3330
+    },
+    {
+      "epoch": 0.5921777777777778,
+      "grad_norm": 0.33843209912459327,
+      "learning_rate": 7.526934994457844e-05,
+      "loss": 0.5735,
+      "step": 3331
+    },
+    {
+      "epoch": 0.5923555555555555,
+      "grad_norm": 0.35346176512951677,
+      "learning_rate": 7.521356214291196e-05,
+      "loss": 0.5558,
+      "step": 3332
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.3509996723425611,
+      "learning_rate": 7.515778255922632e-05,
+      "loss": 0.5593,
+      "step": 3333
+    },
+    {
+      "epoch": 0.5927111111111111,
+      "grad_norm": 0.35564686432479586,
+      "learning_rate": 7.510201121201543e-05,
+      "loss": 0.6214,
+      "step": 3334
+    },
+    {
+      "epoch": 0.5928888888888889,
+      "grad_norm": 0.37156628011523685,
+      "learning_rate": 7.504624811977028e-05,
+      "loss": 0.6155,
+      "step": 3335
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.363834113087486,
+      "learning_rate": 7.499049330097927e-05,
+      "loss": 0.5861,
+      "step": 3336
+    },
+    {
+      "epoch": 0.5932444444444445,
+      "grad_norm": 0.3514957916817266,
+      "learning_rate": 7.493474677412794e-05,
+      "loss": 0.634,
+      "step": 3337
+    },
+    {
+      "epoch": 0.5934222222222222,
+      "grad_norm": 0.3558700634123441,
+      "learning_rate": 7.48790085576992e-05,
+      "loss": 0.5639,
+      "step": 3338
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.37334624206829015,
+      "learning_rate": 7.482327867017306e-05,
+      "loss": 0.5887,
+      "step": 3339
+    },
+    {
+      "epoch": 0.5937777777777777,
+      "grad_norm": 0.35003336764712384,
+      "learning_rate": 7.476755713002694e-05,
+      "loss": 0.6204,
+      "step": 3340
+    },
+    {
+      "epoch": 0.5939555555555556,
+      "grad_norm": 0.378493575947033,
+      "learning_rate": 7.471184395573534e-05,
+      "loss": 0.6048,
+      "step": 3341
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.3567038497714876,
+      "learning_rate": 7.465613916577004e-05,
+      "loss": 0.6278,
+      "step": 3342
+    },
+    {
+      "epoch": 0.5943111111111111,
+      "grad_norm": 0.3650714277703017,
+      "learning_rate": 7.460044277860008e-05,
+      "loss": 0.5862,
+      "step": 3343
+    },
+    {
+      "epoch": 0.5944888888888888,
+      "grad_norm": 0.34832197723751246,
+      "learning_rate": 7.454475481269168e-05,
+      "loss": 0.5738,
+      "step": 3344
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.360535790624661,
+      "learning_rate": 7.448907528650823e-05,
+      "loss": 0.6069,
+      "step": 3345
+    },
+    {
+      "epoch": 0.5948444444444444,
+      "grad_norm": 0.36761078093416527,
+      "learning_rate": 7.443340421851041e-05,
+      "loss": 0.6151,
+      "step": 3346
+    },
+    {
+      "epoch": 0.5950222222222222,
+      "grad_norm": 0.3637153394302031,
+      "learning_rate": 7.4377741627156e-05,
+      "loss": 0.6238,
+      "step": 3347
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.35466599643827534,
+      "learning_rate": 7.432208753090009e-05,
+      "loss": 0.5968,
+      "step": 3348
+    },
+    {
+      "epoch": 0.5953777777777778,
+      "grad_norm": 0.38957721706421305,
+      "learning_rate": 7.426644194819477e-05,
+      "loss": 0.6335,
+      "step": 3349
+    },
+    {
+      "epoch": 0.5955555555555555,
+      "grad_norm": 0.3585581377397012,
+      "learning_rate": 7.421080489748953e-05,
+      "loss": 0.62,
+      "step": 3350
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.36279571099801644,
+      "learning_rate": 7.415517639723082e-05,
+      "loss": 0.5589,
+      "step": 3351
+    },
+    {
+      "epoch": 0.5959111111111111,
+      "grad_norm": 0.32450819513265844,
+      "learning_rate": 7.409955646586244e-05,
+      "loss": 0.5549,
+      "step": 3352
+    },
+    {
+      "epoch": 0.5960888888888889,
+      "grad_norm": 0.3281875091609394,
+      "learning_rate": 7.40439451218252e-05,
+      "loss": 0.5703,
+      "step": 3353
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.36882856133349584,
+      "learning_rate": 7.398834238355716e-05,
+      "loss": 0.5806,
+      "step": 3354
+    },
+    {
+      "epoch": 0.5964444444444444,
+      "grad_norm": 0.377243790484592,
+      "learning_rate": 7.393274826949346e-05,
+      "loss": 0.5735,
+      "step": 3355
+    },
+    {
+      "epoch": 0.5966222222222223,
+      "grad_norm": 0.38619203339052627,
+      "learning_rate": 7.387716279806647e-05,
+      "loss": 0.6197,
+      "step": 3356
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.37337661605182787,
+      "learning_rate": 7.382158598770554e-05,
+      "loss": 0.5954,
+      "step": 3357
+    },
+    {
+      "epoch": 0.5969777777777778,
+      "grad_norm": 0.3492734538090118,
+      "learning_rate": 7.376601785683736e-05,
+      "loss": 0.5658,
+      "step": 3358
+    },
+    {
+      "epoch": 0.5971555555555556,
+      "grad_norm": 0.34740816457405305,
+      "learning_rate": 7.371045842388552e-05,
+      "loss": 0.5997,
+      "step": 3359
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.3574778027933568,
+      "learning_rate": 7.365490770727093e-05,
+      "loss": 0.589,
+      "step": 3360
+    },
+    {
+      "epoch": 0.5975111111111111,
+      "grad_norm": 0.35794504027300283,
+      "learning_rate": 7.359936572541142e-05,
+      "loss": 0.6222,
+      "step": 3361
+    },
+    {
+      "epoch": 0.5976888888888889,
+      "grad_norm": 0.3410778406720154,
+      "learning_rate": 7.354383249672212e-05,
+      "loss": 0.579,
+      "step": 3362
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.3475162117944813,
+      "learning_rate": 7.348830803961507e-05,
+      "loss": 0.5828,
+      "step": 3363
+    },
+    {
+      "epoch": 0.5980444444444445,
+      "grad_norm": 0.38229664498826715,
+      "learning_rate": 7.343279237249953e-05,
+      "loss": 0.5836,
+      "step": 3364
+    },
+    {
+      "epoch": 0.5982222222222222,
+      "grad_norm": 0.36707632210616165,
+      "learning_rate": 7.337728551378179e-05,
+      "loss": 0.5948,
+      "step": 3365
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3474817079331617,
+      "learning_rate": 7.332178748186525e-05,
+      "loss": 0.5889,
+      "step": 3366
+    },
+    {
+      "epoch": 0.5985777777777778,
+      "grad_norm": 0.3713238049763101,
+      "learning_rate": 7.326629829515033e-05,
+      "loss": 0.6178,
+      "step": 3367
+    },
+    {
+      "epoch": 0.5987555555555556,
+      "grad_norm": 0.36966344233847986,
+      "learning_rate": 7.32108179720346e-05,
+      "loss": 0.6202,
+      "step": 3368
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.37475955569802427,
+      "learning_rate": 7.31553465309126e-05,
+      "loss": 0.6087,
+      "step": 3369
+    },
+    {
+      "epoch": 0.5991111111111111,
+      "grad_norm": 0.3592736417732489,
+      "learning_rate": 7.309988399017602e-05,
+      "loss": 0.6101,
+      "step": 3370
+    },
+    {
+      "epoch": 0.5992888888888889,
+      "grad_norm": 0.34115307886889823,
+      "learning_rate": 7.304443036821347e-05,
+      "loss": 0.5948,
+      "step": 3371
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.3586940032510138,
+      "learning_rate": 7.298898568341079e-05,
+      "loss": 0.5915,
+      "step": 3372
+    },
+    {
+      "epoch": 0.5996444444444444,
+      "grad_norm": 0.36482075692038446,
+      "learning_rate": 7.293354995415063e-05,
+      "loss": 0.5824,
+      "step": 3373
+    },
+    {
+      "epoch": 0.5998222222222223,
+      "grad_norm": 0.3511386745325496,
+      "learning_rate": 7.28781231988129e-05,
+      "loss": 0.5581,
+      "step": 3374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.35392495004983016,
+      "learning_rate": 7.282270543577436e-05,
+      "loss": 0.586,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6001777777777778,
+      "grad_norm": 0.3375085603248464,
+      "learning_rate": 7.276729668340888e-05,
+      "loss": 0.5754,
+      "step": 3376
+    },
+    {
+      "epoch": 0.6003555555555555,
+      "grad_norm": 0.3231489059358228,
+      "learning_rate": 7.271189696008729e-05,
+      "loss": 0.5839,
+      "step": 3377
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.3395463107124685,
+      "learning_rate": 7.265650628417747e-05,
+      "loss": 0.6095,
+      "step": 3378
+    },
+    {
+      "epoch": 0.6007111111111111,
+      "grad_norm": 0.3487272701191673,
+      "learning_rate": 7.260112467404427e-05,
+      "loss": 0.6056,
+      "step": 3379
+    },
+    {
+      "epoch": 0.6008888888888889,
+      "grad_norm": 0.3563527272856261,
+      "learning_rate": 7.254575214804959e-05,
+      "loss": 0.5952,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.3354288791679355,
+      "learning_rate": 7.24903887245522e-05,
+      "loss": 0.586,
+      "step": 3381
+    },
+    {
+      "epoch": 0.6012444444444445,
+      "grad_norm": 0.3591236916224874,
+      "learning_rate": 7.2435034421908e-05,
+      "loss": 0.6042,
+      "step": 3382
+    },
+    {
+      "epoch": 0.6014222222222222,
+      "grad_norm": 0.3480128202901435,
+      "learning_rate": 7.237968925846971e-05,
+      "loss": 0.6178,
+      "step": 3383
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3582522058815753,
+      "learning_rate": 7.23243532525872e-05,
+      "loss": 0.6054,
+      "step": 3384
+    },
+    {
+      "epoch": 0.6017777777777777,
+      "grad_norm": 0.34547437305602535,
+      "learning_rate": 7.226902642260711e-05,
+      "loss": 0.583,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6019555555555556,
+      "grad_norm": 0.3728007066025142,
+      "learning_rate": 7.221370878687324e-05,
+      "loss": 0.6072,
+      "step": 3386
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.36040920892315237,
+      "learning_rate": 7.215840036372611e-05,
+      "loss": 0.5431,
+      "step": 3387
+    },
+    {
+      "epoch": 0.6023111111111111,
+      "grad_norm": 0.3702255694427694,
+      "learning_rate": 7.210310117150342e-05,
+      "loss": 0.6339,
+      "step": 3388
+    },
+    {
+      "epoch": 0.6024888888888889,
+      "grad_norm": 0.3365385578466156,
+      "learning_rate": 7.204781122853966e-05,
+      "loss": 0.6097,
+      "step": 3389
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.35864118900041453,
+      "learning_rate": 7.199253055316629e-05,
+      "loss": 0.6183,
+      "step": 3390
+    },
+    {
+      "epoch": 0.6028444444444444,
+      "grad_norm": 0.33154923908210954,
+      "learning_rate": 7.19372591637117e-05,
+      "loss": 0.549,
+      "step": 3391
+    },
+    {
+      "epoch": 0.6030222222222222,
+      "grad_norm": 0.3650137900719657,
+      "learning_rate": 7.188199707850122e-05,
+      "loss": 0.6081,
+      "step": 3392
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.34384244014334475,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 0.5718,
+      "step": 3393
+    },
+    {
+      "epoch": 0.6033777777777778,
+      "grad_norm": 0.36559871282173134,
+      "learning_rate": 7.177150089409835e-05,
+      "loss": 0.5855,
+      "step": 3394
+    },
+    {
+      "epoch": 0.6035555555555555,
+      "grad_norm": 0.3619185449952785,
+      "learning_rate": 7.171626683154112e-05,
+      "loss": 0.6064,
+      "step": 3395
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.3562468123241234,
+      "learning_rate": 7.166104214649839e-05,
+      "loss": 0.6167,
+      "step": 3396
+    },
+    {
+      "epoch": 0.6039111111111111,
+      "grad_norm": 0.34835039127977774,
+      "learning_rate": 7.160582685727986e-05,
+      "loss": 0.5333,
+      "step": 3397
+    },
+    {
+      "epoch": 0.6040888888888889,
+      "grad_norm": 0.35517134904340303,
+      "learning_rate": 7.155062098219235e-05,
+      "loss": 0.5819,
+      "step": 3398
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.36803369471251707,
+      "learning_rate": 7.149542453953938e-05,
+      "loss": 0.6475,
+      "step": 3399
+    },
+    {
+      "epoch": 0.6044444444444445,
+      "grad_norm": 0.34423511465744333,
+      "learning_rate": 7.144023754762149e-05,
+      "loss": 0.6139,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6046222222222222,
+      "grad_norm": 0.3639381293621342,
+      "learning_rate": 7.138506002473591e-05,
+      "loss": 0.5888,
+      "step": 3401
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3594084399758162,
+      "learning_rate": 7.132989198917692e-05,
+      "loss": 0.5633,
+      "step": 3402
+    },
+    {
+      "epoch": 0.6049777777777777,
+      "grad_norm": 0.35620243783022976,
+      "learning_rate": 7.127473345923554e-05,
+      "loss": 0.6213,
+      "step": 3403
+    },
+    {
+      "epoch": 0.6051555555555556,
+      "grad_norm": 0.37797755027524865,
+      "learning_rate": 7.121958445319965e-05,
+      "loss": 0.6171,
+      "step": 3404
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.3469824290242751,
+      "learning_rate": 7.116444498935396e-05,
+      "loss": 0.5921,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6055111111111111,
+      "grad_norm": 0.37160240787966137,
+      "learning_rate": 7.110931508598011e-05,
+      "loss": 0.6094,
+      "step": 3406
+    },
+    {
+      "epoch": 0.6056888888888889,
+      "grad_norm": 0.3587097698412013,
+      "learning_rate": 7.105419476135643e-05,
+      "loss": 0.6159,
+      "step": 3407
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.33870367471615753,
+      "learning_rate": 7.099908403375823e-05,
+      "loss": 0.5809,
+      "step": 3408
+    },
+    {
+      "epoch": 0.6060444444444445,
+      "grad_norm": 0.3509255640429414,
+      "learning_rate": 7.094398292145746e-05,
+      "loss": 0.5664,
+      "step": 3409
+    },
+    {
+      "epoch": 0.6062222222222222,
+      "grad_norm": 0.35089660964993785,
+      "learning_rate": 7.088889144272305e-05,
+      "loss": 0.605,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3606835979508538,
+      "learning_rate": 7.083380961582062e-05,
+      "loss": 0.5673,
+      "step": 3411
+    },
+    {
+      "epoch": 0.6065777777777778,
+      "grad_norm": 0.35417333026626147,
+      "learning_rate": 7.077873745901269e-05,
+      "loss": 0.5927,
+      "step": 3412
+    },
+    {
+      "epoch": 0.6067555555555556,
+      "grad_norm": 0.3273834645633267,
+      "learning_rate": 7.072367499055844e-05,
+      "loss": 0.5911,
+      "step": 3413
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.35203990821899905,
+      "learning_rate": 7.066862222871397e-05,
+      "loss": 0.59,
+      "step": 3414
+    },
+    {
+      "epoch": 0.6071111111111112,
+      "grad_norm": 0.3301638394687133,
+      "learning_rate": 7.061357919173209e-05,
+      "loss": 0.5655,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6072888888888889,
+      "grad_norm": 0.3650828916902255,
+      "learning_rate": 7.055854589786241e-05,
+      "loss": 0.5676,
+      "step": 3416
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.33755707923075684,
+      "learning_rate": 7.050352236535125e-05,
+      "loss": 0.5728,
+      "step": 3417
+    },
+    {
+      "epoch": 0.6076444444444444,
+      "grad_norm": 0.3496975276716916,
+      "learning_rate": 7.044850861244184e-05,
+      "loss": 0.5374,
+      "step": 3418
+    },
+    {
+      "epoch": 0.6078222222222223,
+      "grad_norm": 0.38579749447468065,
+      "learning_rate": 7.039350465737396e-05,
+      "loss": 0.5871,
+      "step": 3419
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3706742400380307,
+      "learning_rate": 7.033851051838437e-05,
+      "loss": 0.5855,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6081777777777778,
+      "grad_norm": 0.3762731585109549,
+      "learning_rate": 7.028352621370635e-05,
+      "loss": 0.6535,
+      "step": 3421
+    },
+    {
+      "epoch": 0.6083555555555555,
+      "grad_norm": 0.40364872923069495,
+      "learning_rate": 7.022855176157016e-05,
+      "loss": 0.5838,
+      "step": 3422
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.3446107212341191,
+      "learning_rate": 7.01735871802025e-05,
+      "loss": 0.5465,
+      "step": 3423
+    },
+    {
+      "epoch": 0.6087111111111111,
+      "grad_norm": 0.3630382413718293,
+      "learning_rate": 7.011863248782711e-05,
+      "loss": 0.5693,
+      "step": 3424
+    },
+    {
+      "epoch": 0.6088888888888889,
+      "grad_norm": 0.3392069360937096,
+      "learning_rate": 7.006368770266421e-05,
+      "loss": 0.5707,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.34262698514308126,
+      "learning_rate": 7.000875284293087e-05,
+      "loss": 0.5448,
+      "step": 3426
+    },
+    {
+      "epoch": 0.6092444444444445,
+      "grad_norm": 0.3753001426240263,
+      "learning_rate": 6.99538279268408e-05,
+      "loss": 0.5916,
+      "step": 3427
+    },
+    {
+      "epoch": 0.6094222222222222,
+      "grad_norm": 0.3549000141752506,
+      "learning_rate": 6.989891297260445e-05,
+      "loss": 0.6293,
+      "step": 3428
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.36388223808887243,
+      "learning_rate": 6.984400799842894e-05,
+      "loss": 0.6224,
+      "step": 3429
+    },
+    {
+      "epoch": 0.6097777777777778,
+      "grad_norm": 0.35668869119572716,
+      "learning_rate": 6.978911302251816e-05,
+      "loss": 0.561,
+      "step": 3430
+    },
+    {
+      "epoch": 0.6099555555555556,
+      "grad_norm": 0.45875194448495316,
+      "learning_rate": 6.97342280630725e-05,
+      "loss": 0.6046,
+      "step": 3431
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.3817313111470102,
+      "learning_rate": 6.967935313828929e-05,
+      "loss": 0.5896,
+      "step": 3432
+    },
+    {
+      "epoch": 0.6103111111111111,
+      "grad_norm": 0.36625653110145323,
+      "learning_rate": 6.962448826636227e-05,
+      "loss": 0.6292,
+      "step": 3433
+    },
+    {
+      "epoch": 0.6104888888888889,
+      "grad_norm": 0.33666849321718983,
+      "learning_rate": 6.95696334654821e-05,
+      "loss": 0.5515,
+      "step": 3434
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.3716990902826086,
+      "learning_rate": 6.951478875383583e-05,
+      "loss": 0.6169,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6108444444444444,
+      "grad_norm": 0.3612786294761165,
+      "learning_rate": 6.945995414960744e-05,
+      "loss": 0.5923,
+      "step": 3436
+    },
+    {
+      "epoch": 0.6110222222222222,
+      "grad_norm": 0.37804702404560686,
+      "learning_rate": 6.940512967097732e-05,
+      "loss": 0.6244,
+      "step": 3437
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3409226264109023,
+      "learning_rate": 6.93503153361227e-05,
+      "loss": 0.5481,
+      "step": 3438
+    },
+    {
+      "epoch": 0.6113777777777778,
+      "grad_norm": 0.37905874923852434,
+      "learning_rate": 6.929551116321728e-05,
+      "loss": 0.6298,
+      "step": 3439
+    },
+    {
+      "epoch": 0.6115555555555555,
+      "grad_norm": 0.3519272670285671,
+      "learning_rate": 6.92407171704315e-05,
+      "loss": 0.6168,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.3477323318060181,
+      "learning_rate": 6.918593337593238e-05,
+      "loss": 0.5963,
+      "step": 3441
+    },
+    {
+      "epoch": 0.6119111111111111,
+      "grad_norm": 0.36818449007290877,
+      "learning_rate": 6.913115979788361e-05,
+      "loss": 0.6239,
+      "step": 3442
+    },
+    {
+      "epoch": 0.6120888888888889,
+      "grad_norm": 0.38883583130154914,
+      "learning_rate": 6.907639645444536e-05,
+      "loss": 0.5935,
+      "step": 3443
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.3422123624192518,
+      "learning_rate": 6.902164336377461e-05,
+      "loss": 0.563,
+      "step": 3444
+    },
+    {
+      "epoch": 0.6124444444444445,
+      "grad_norm": 0.3601061351890906,
+      "learning_rate": 6.896690054402473e-05,
+      "loss": 0.5782,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6126222222222222,
+      "grad_norm": 0.3441416166286006,
+      "learning_rate": 6.891216801334588e-05,
+      "loss": 0.5841,
+      "step": 3446
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.3975417759059079,
+      "learning_rate": 6.885744578988463e-05,
+      "loss": 0.6506,
+      "step": 3447
+    },
+    {
+      "epoch": 0.6129777777777777,
+      "grad_norm": 0.341448084704136,
+      "learning_rate": 6.88027338917843e-05,
+      "loss": 0.5624,
+      "step": 3448
+    },
+    {
+      "epoch": 0.6131555555555556,
+      "grad_norm": 0.4099521071865417,
+      "learning_rate": 6.874803233718459e-05,
+      "loss": 0.585,
+      "step": 3449
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.37124650499283024,
+      "learning_rate": 6.869334114422199e-05,
+      "loss": 0.6219,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6135111111111111,
+      "grad_norm": 0.36201958031038717,
+      "learning_rate": 6.863866033102939e-05,
+      "loss": 0.5911,
+      "step": 3451
+    },
+    {
+      "epoch": 0.6136888888888888,
+      "grad_norm": 0.3449753701435678,
+      "learning_rate": 6.858398991573631e-05,
+      "loss": 0.5899,
+      "step": 3452
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.34640689141990855,
+      "learning_rate": 6.852932991646881e-05,
+      "loss": 0.5842,
+      "step": 3453
+    },
+    {
+      "epoch": 0.6140444444444444,
+      "grad_norm": 0.34622564944822953,
+      "learning_rate": 6.847468035134951e-05,
+      "loss": 0.5956,
+      "step": 3454
+    },
+    {
+      "epoch": 0.6142222222222222,
+      "grad_norm": 0.3634386258832327,
+      "learning_rate": 6.842004123849752e-05,
+      "loss": 0.621,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.34729536366957653,
+      "learning_rate": 6.836541259602856e-05,
+      "loss": 0.5592,
+      "step": 3456
+    },
+    {
+      "epoch": 0.6145777777777778,
+      "grad_norm": 0.3569812522628856,
+      "learning_rate": 6.83107944420548e-05,
+      "loss": 0.614,
+      "step": 3457
+    },
+    {
+      "epoch": 0.6147555555555556,
+      "grad_norm": 0.33679433365634,
+      "learning_rate": 6.825618679468502e-05,
+      "loss": 0.5676,
+      "step": 3458
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.33882201084539326,
+      "learning_rate": 6.820158967202439e-05,
+      "loss": 0.5212,
+      "step": 3459
+    },
+    {
+      "epoch": 0.6151111111111112,
+      "grad_norm": 0.35537078550186557,
+      "learning_rate": 6.814700309217476e-05,
+      "loss": 0.5638,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6152888888888889,
+      "grad_norm": 0.352661724425596,
+      "learning_rate": 6.809242707323432e-05,
+      "loss": 0.5938,
+      "step": 3461
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.36952325119547186,
+      "learning_rate": 6.80378616332979e-05,
+      "loss": 0.5615,
+      "step": 3462
+    },
+    {
+      "epoch": 0.6156444444444444,
+      "grad_norm": 0.38565201108993036,
+      "learning_rate": 6.79833067904567e-05,
+      "loss": 0.5787,
+      "step": 3463
+    },
+    {
+      "epoch": 0.6158222222222223,
+      "grad_norm": 0.3541276041447201,
+      "learning_rate": 6.792876256279846e-05,
+      "loss": 0.5896,
+      "step": 3464
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.39347866147464766,
+      "learning_rate": 6.787422896840743e-05,
+      "loss": 0.5866,
+      "step": 3465
+    },
+    {
+      "epoch": 0.6161777777777778,
+      "grad_norm": 0.3376416182067583,
+      "learning_rate": 6.781970602536432e-05,
+      "loss": 0.5613,
+      "step": 3466
+    },
+    {
+      "epoch": 0.6163555555555555,
+      "grad_norm": 0.35782104298073997,
+      "learning_rate": 6.776519375174621e-05,
+      "loss": 0.59,
+      "step": 3467
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.34311783171678184,
+      "learning_rate": 6.771069216562684e-05,
+      "loss": 0.5509,
+      "step": 3468
+    },
+    {
+      "epoch": 0.6167111111111111,
+      "grad_norm": 0.35297933042292007,
+      "learning_rate": 6.765620128507619e-05,
+      "loss": 0.5925,
+      "step": 3469
+    },
+    {
+      "epoch": 0.6168888888888889,
+      "grad_norm": 0.36537623858666873,
+      "learning_rate": 6.76017211281609e-05,
+      "loss": 0.6049,
+      "step": 3470
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.3722809872677141,
+      "learning_rate": 6.754725171294382e-05,
+      "loss": 0.5862,
+      "step": 3471
+    },
+    {
+      "epoch": 0.6172444444444445,
+      "grad_norm": 0.35907129238545,
+      "learning_rate": 6.749279305748448e-05,
+      "loss": 0.5986,
+      "step": 3472
+    },
+    {
+      "epoch": 0.6174222222222222,
+      "grad_norm": 0.3620021998753497,
+      "learning_rate": 6.743834517983865e-05,
+      "loss": 0.5954,
+      "step": 3473
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3555134466831354,
+      "learning_rate": 6.73839080980587e-05,
+      "loss": 0.5907,
+      "step": 3474
+    },
+    {
+      "epoch": 0.6177777777777778,
+      "grad_norm": 0.3405314946741367,
+      "learning_rate": 6.732948183019324e-05,
+      "loss": 0.5286,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6179555555555556,
+      "grad_norm": 0.3419335786782383,
+      "learning_rate": 6.727506639428739e-05,
+      "loss": 0.5783,
+      "step": 3476
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.3608614890871183,
+      "learning_rate": 6.72206618083827e-05,
+      "loss": 0.6059,
+      "step": 3477
+    },
+    {
+      "epoch": 0.6183111111111111,
+      "grad_norm": 0.350330236392432,
+      "learning_rate": 6.71662680905171e-05,
+      "loss": 0.6079,
+      "step": 3478
+    },
+    {
+      "epoch": 0.6184888888888889,
+      "grad_norm": 0.3701563113973166,
+      "learning_rate": 6.711188525872486e-05,
+      "loss": 0.5895,
+      "step": 3479
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.3459818472425109,
+      "learning_rate": 6.705751333103675e-05,
+      "loss": 0.5937,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6188444444444444,
+      "grad_norm": 0.34377589387540275,
+      "learning_rate": 6.700315232547981e-05,
+      "loss": 0.6201,
+      "step": 3481
+    },
+    {
+      "epoch": 0.6190222222222223,
+      "grad_norm": 0.3481638400587967,
+      "learning_rate": 6.694880226007757e-05,
+      "loss": 0.5828,
+      "step": 3482
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.35083915324440135,
+      "learning_rate": 6.689446315284981e-05,
+      "loss": 0.5639,
+      "step": 3483
+    },
+    {
+      "epoch": 0.6193777777777778,
+      "grad_norm": 0.36772858049589496,
+      "learning_rate": 6.684013502181281e-05,
+      "loss": 0.5576,
+      "step": 3484
+    },
+    {
+      "epoch": 0.6195555555555555,
+      "grad_norm": 0.35895734465117346,
+      "learning_rate": 6.678581788497908e-05,
+      "loss": 0.5583,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.36488448897219433,
+      "learning_rate": 6.673151176035762e-05,
+      "loss": 0.586,
+      "step": 3486
+    },
+    {
+      "epoch": 0.6199111111111111,
+      "grad_norm": 0.3556208053970919,
+      "learning_rate": 6.667721666595365e-05,
+      "loss": 0.6065,
+      "step": 3487
+    },
+    {
+      "epoch": 0.6200888888888889,
+      "grad_norm": 0.3484665544452661,
+      "learning_rate": 6.662293261976882e-05,
+      "loss": 0.6213,
+      "step": 3488
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.358535263140265,
+      "learning_rate": 6.656865963980105e-05,
+      "loss": 0.5572,
+      "step": 3489
+    },
+    {
+      "epoch": 0.6204444444444445,
+      "grad_norm": 0.3524276919910799,
+      "learning_rate": 6.651439774404471e-05,
+      "loss": 0.5964,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6206222222222222,
+      "grad_norm": 0.36854678642307076,
+      "learning_rate": 6.64601469504903e-05,
+      "loss": 0.6039,
+      "step": 3491
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3528405841598993,
+      "learning_rate": 6.640590727712485e-05,
+      "loss": 0.6008,
+      "step": 3492
+    },
+    {
+      "epoch": 0.6209777777777777,
+      "grad_norm": 0.37787793788798146,
+      "learning_rate": 6.635167874193153e-05,
+      "loss": 0.6022,
+      "step": 3493
+    },
+    {
+      "epoch": 0.6211555555555556,
+      "grad_norm": 0.3628039732841485,
+      "learning_rate": 6.629746136288997e-05,
+      "loss": 0.6084,
+      "step": 3494
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.36982780314785374,
+      "learning_rate": 6.624325515797593e-05,
+      "loss": 0.6215,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6215111111111111,
+      "grad_norm": 0.339044758060382,
+      "learning_rate": 6.618906014516168e-05,
+      "loss": 0.6094,
+      "step": 3496
+    },
+    {
+      "epoch": 0.6216888888888888,
+      "grad_norm": 0.3498626307348065,
+      "learning_rate": 6.613487634241553e-05,
+      "loss": 0.6208,
+      "step": 3497
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.3380632226383569,
+      "learning_rate": 6.608070376770231e-05,
+      "loss": 0.5599,
+      "step": 3498
+    },
+    {
+      "epoch": 0.6220444444444444,
+      "grad_norm": 0.3309575532400716,
+      "learning_rate": 6.602654243898294e-05,
+      "loss": 0.5719,
+      "step": 3499
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.36177064999756525,
+      "learning_rate": 6.597239237421476e-05,
+      "loss": 0.6276,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3604754376620941,
+      "learning_rate": 6.591825359135123e-05,
+      "loss": 0.5731,
+      "step": 3501
+    },
+    {
+      "epoch": 0.6225777777777778,
+      "grad_norm": 0.3775551538387199,
+      "learning_rate": 6.586412610834221e-05,
+      "loss": 0.6182,
+      "step": 3502
+    },
+    {
+      "epoch": 0.6227555555555555,
+      "grad_norm": 0.39689345077170296,
+      "learning_rate": 6.581000994313369e-05,
+      "loss": 0.5862,
+      "step": 3503
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.4202047682946692,
+      "learning_rate": 6.575590511366804e-05,
+      "loss": 0.6513,
+      "step": 3504
+    },
+    {
+      "epoch": 0.6231111111111111,
+      "grad_norm": 0.37963083087491084,
+      "learning_rate": 6.57018116378837e-05,
+      "loss": 0.608,
+      "step": 3505
+    },
+    {
+      "epoch": 0.6232888888888889,
+      "grad_norm": 0.3546949873482942,
+      "learning_rate": 6.564772953371555e-05,
+      "loss": 0.6026,
+      "step": 3506
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.34509642068842444,
+      "learning_rate": 6.55936588190945e-05,
+      "loss": 0.5632,
+      "step": 3507
+    },
+    {
+      "epoch": 0.6236444444444444,
+      "grad_norm": 0.3668139779251502,
+      "learning_rate": 6.553959951194787e-05,
+      "loss": 0.5937,
+      "step": 3508
+    },
+    {
+      "epoch": 0.6238222222222222,
+      "grad_norm": 0.34196327178857566,
+      "learning_rate": 6.5485551630199e-05,
+      "loss": 0.5792,
+      "step": 3509
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.340771113425655,
+      "learning_rate": 6.543151519176764e-05,
+      "loss": 0.6075,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6241777777777778,
+      "grad_norm": 0.37371294048224785,
+      "learning_rate": 6.537749021456959e-05,
+      "loss": 0.6254,
+      "step": 3511
+    },
+    {
+      "epoch": 0.6243555555555556,
+      "grad_norm": 0.3472025361223291,
+      "learning_rate": 6.532347671651697e-05,
+      "loss": 0.5427,
+      "step": 3512
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.3597057240268955,
+      "learning_rate": 6.526947471551798e-05,
+      "loss": 0.6071,
+      "step": 3513
+    },
+    {
+      "epoch": 0.6247111111111111,
+      "grad_norm": 0.3589264481249089,
+      "learning_rate": 6.521548422947709e-05,
+      "loss": 0.5886,
+      "step": 3514
+    },
+    {
+      "epoch": 0.6248888888888889,
+      "grad_norm": 0.5912673206431269,
+      "learning_rate": 6.516150527629495e-05,
+      "loss": 0.5546,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.3424699171545662,
+      "learning_rate": 6.510753787386831e-05,
+      "loss": 0.6057,
+      "step": 3516
+    },
+    {
+      "epoch": 0.6252444444444445,
+      "grad_norm": 0.3553341428033302,
+      "learning_rate": 6.505358204009017e-05,
+      "loss": 0.6527,
+      "step": 3517
+    },
+    {
+      "epoch": 0.6254222222222222,
+      "grad_norm": 0.34647811154723,
+      "learning_rate": 6.499963779284971e-05,
+      "loss": 0.5721,
+      "step": 3518
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3827205530879716,
+      "learning_rate": 6.494570515003214e-05,
+      "loss": 0.5961,
+      "step": 3519
+    },
+    {
+      "epoch": 0.6257777777777778,
+      "grad_norm": 0.3572996619245539,
+      "learning_rate": 6.489178412951899e-05,
+      "loss": 0.6128,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6259555555555556,
+      "grad_norm": 0.3358120218729844,
+      "learning_rate": 6.483787474918779e-05,
+      "loss": 0.5271,
+      "step": 3521
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.37989551295032553,
+      "learning_rate": 6.478397702691236e-05,
+      "loss": 0.6234,
+      "step": 3522
+    },
+    {
+      "epoch": 0.6263111111111112,
+      "grad_norm": 0.36223898724996817,
+      "learning_rate": 6.473009098056246e-05,
+      "loss": 0.5742,
+      "step": 3523
+    },
+    {
+      "epoch": 0.6264888888888889,
+      "grad_norm": 0.37495641852335404,
+      "learning_rate": 6.46762166280042e-05,
+      "loss": 0.5455,
+      "step": 3524
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.36508614808648054,
+      "learning_rate": 6.462235398709963e-05,
+      "loss": 0.5958,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6268444444444444,
+      "grad_norm": 0.35860944822508845,
+      "learning_rate": 6.456850307570704e-05,
+      "loss": 0.6057,
+      "step": 3526
+    },
+    {
+      "epoch": 0.6270222222222223,
+      "grad_norm": 0.34764076097932783,
+      "learning_rate": 6.451466391168072e-05,
+      "loss": 0.5716,
+      "step": 3527
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.37072155796838124,
+      "learning_rate": 6.44608365128712e-05,
+      "loss": 0.5797,
+      "step": 3528
+    },
+    {
+      "epoch": 0.6273777777777778,
+      "grad_norm": 0.3657179908443539,
+      "learning_rate": 6.440702089712494e-05,
+      "loss": 0.5646,
+      "step": 3529
+    },
+    {
+      "epoch": 0.6275555555555555,
+      "grad_norm": 0.3665192780348969,
+      "learning_rate": 6.43532170822847e-05,
+      "loss": 0.5686,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.38878454090009323,
+      "learning_rate": 6.429942508618911e-05,
+      "loss": 0.6099,
+      "step": 3531
+    },
+    {
+      "epoch": 0.6279111111111111,
+      "grad_norm": 0.3571545416145081,
+      "learning_rate": 6.424564492667309e-05,
+      "loss": 0.618,
+      "step": 3532
+    },
+    {
+      "epoch": 0.6280888888888889,
+      "grad_norm": 0.4969865001081727,
+      "learning_rate": 6.419187662156743e-05,
+      "loss": 0.6076,
+      "step": 3533
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.3473146792785242,
+      "learning_rate": 6.413812018869918e-05,
+      "loss": 0.6025,
+      "step": 3534
+    },
+    {
+      "epoch": 0.6284444444444445,
+      "grad_norm": 0.34805733210335354,
+      "learning_rate": 6.40843756458913e-05,
+      "loss": 0.5854,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6286222222222222,
+      "grad_norm": 0.3239626108847237,
+      "learning_rate": 6.403064301096294e-05,
+      "loss": 0.5545,
+      "step": 3536
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3802899492632204,
+      "learning_rate": 6.397692230172918e-05,
+      "loss": 0.6472,
+      "step": 3537
+    },
+    {
+      "epoch": 0.6289777777777777,
+      "grad_norm": 0.3679740450769246,
+      "learning_rate": 6.392321353600124e-05,
+      "loss": 0.5777,
+      "step": 3538
+    },
+    {
+      "epoch": 0.6291555555555556,
+      "grad_norm": 0.380688082322288,
+      "learning_rate": 6.386951673158629e-05,
+      "loss": 0.6141,
+      "step": 3539
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.3916850997059995,
+      "learning_rate": 6.381583190628768e-05,
+      "loss": 0.5801,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6295111111111111,
+      "grad_norm": 0.3609732841254345,
+      "learning_rate": 6.376215907790458e-05,
+      "loss": 0.5754,
+      "step": 3541
+    },
+    {
+      "epoch": 0.6296888888888889,
+      "grad_norm": 0.3728953022456657,
+      "learning_rate": 6.37084982642324e-05,
+      "loss": 0.5875,
+      "step": 3542
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.3442642696488891,
+      "learning_rate": 6.365484948306237e-05,
+      "loss": 0.6076,
+      "step": 3543
+    },
+    {
+      "epoch": 0.6300444444444444,
+      "grad_norm": 0.35495526276676037,
+      "learning_rate": 6.360121275218191e-05,
+      "loss": 0.6145,
+      "step": 3544
+    },
+    {
+      "epoch": 0.6302222222222222,
+      "grad_norm": 0.3622861146244761,
+      "learning_rate": 6.35475880893743e-05,
+      "loss": 0.6028,
+      "step": 3545
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.34979537722006593,
+      "learning_rate": 6.349397551241894e-05,
+      "loss": 0.5677,
+      "step": 3546
+    },
+    {
+      "epoch": 0.6305777777777778,
+      "grad_norm": 0.3563191619482736,
+      "learning_rate": 6.344037503909107e-05,
+      "loss": 0.593,
+      "step": 3547
+    },
+    {
+      "epoch": 0.6307555555555555,
+      "grad_norm": 0.35477512949167805,
+      "learning_rate": 6.338678668716209e-05,
+      "loss": 0.6049,
+      "step": 3548
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.35439656579729806,
+      "learning_rate": 6.333321047439925e-05,
+      "loss": 0.5905,
+      "step": 3549
+    },
+    {
+      "epoch": 0.6311111111111111,
+      "grad_norm": 0.35037068289693934,
+      "learning_rate": 6.327964641856585e-05,
+      "loss": 0.5684,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6312888888888889,
+      "grad_norm": 0.36293152726456257,
+      "learning_rate": 6.322609453742113e-05,
+      "loss": 0.6106,
+      "step": 3551
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.3523435466776201,
+      "learning_rate": 6.317255484872028e-05,
+      "loss": 0.5698,
+      "step": 3552
+    },
+    {
+      "epoch": 0.6316444444444445,
+      "grad_norm": 0.34265393737932,
+      "learning_rate": 6.311902737021447e-05,
+      "loss": 0.5689,
+      "step": 3553
+    },
+    {
+      "epoch": 0.6318222222222222,
+      "grad_norm": 0.3501369098553959,
+      "learning_rate": 6.306551211965087e-05,
+      "loss": 0.6044,
+      "step": 3554
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.35140657805071,
+      "learning_rate": 6.301200911477243e-05,
+      "loss": 0.6326,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6321777777777777,
+      "grad_norm": 0.39085407842069364,
+      "learning_rate": 6.295851837331826e-05,
+      "loss": 0.6193,
+      "step": 3556
+    },
+    {
+      "epoch": 0.6323555555555556,
+      "grad_norm": 0.35576980451479745,
+      "learning_rate": 6.290503991302324e-05,
+      "loss": 0.6019,
+      "step": 3557
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.7752817910640124,
+      "learning_rate": 6.285157375161825e-05,
+      "loss": 0.6049,
+      "step": 3558
+    },
+    {
+      "epoch": 0.6327111111111111,
+      "grad_norm": 0.3646257448803145,
+      "learning_rate": 6.279811990683006e-05,
+      "loss": 0.5776,
+      "step": 3559
+    },
+    {
+      "epoch": 0.6328888888888888,
+      "grad_norm": 0.3551345148143536,
+      "learning_rate": 6.274467839638142e-05,
+      "loss": 0.572,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.36103613728553446,
+      "learning_rate": 6.26912492379909e-05,
+      "loss": 0.5909,
+      "step": 3561
+    },
+    {
+      "epoch": 0.6332444444444445,
+      "grad_norm": 0.33590653824655003,
+      "learning_rate": 6.2637832449373e-05,
+      "loss": 0.5457,
+      "step": 3562
+    },
+    {
+      "epoch": 0.6334222222222222,
+      "grad_norm": 0.3584554705871418,
+      "learning_rate": 6.258442804823818e-05,
+      "loss": 0.5899,
+      "step": 3563
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.33794756157224276,
+      "learning_rate": 6.253103605229279e-05,
+      "loss": 0.5733,
+      "step": 3564
+    },
+    {
+      "epoch": 0.6337777777777778,
+      "grad_norm": 0.34775818070893616,
+      "learning_rate": 6.24776564792389e-05,
+      "loss": 0.573,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6339555555555556,
+      "grad_norm": 0.3587005246754528,
+      "learning_rate": 6.242428934677469e-05,
+      "loss": 0.6147,
+      "step": 3566
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.34490202538297937,
+      "learning_rate": 6.237093467259406e-05,
+      "loss": 0.5555,
+      "step": 3567
+    },
+    {
+      "epoch": 0.6343111111111112,
+      "grad_norm": 0.3442695358512481,
+      "learning_rate": 6.231759247438689e-05,
+      "loss": 0.6082,
+      "step": 3568
+    },
+    {
+      "epoch": 0.6344888888888889,
+      "grad_norm": 0.35557839116978457,
+      "learning_rate": 6.22642627698388e-05,
+      "loss": 0.5502,
+      "step": 3569
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.3512858908955408,
+      "learning_rate": 6.22109455766314e-05,
+      "loss": 0.6629,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6348444444444444,
+      "grad_norm": 0.3492902926188577,
+      "learning_rate": 6.215764091244202e-05,
+      "loss": 0.5978,
+      "step": 3571
+    },
+    {
+      "epoch": 0.6350222222222223,
+      "grad_norm": 0.3346870877084782,
+      "learning_rate": 6.210434879494398e-05,
+      "loss": 0.607,
+      "step": 3572
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.33149197422111853,
+      "learning_rate": 6.205106924180628e-05,
+      "loss": 0.5434,
+      "step": 3573
+    },
+    {
+      "epoch": 0.6353777777777778,
+      "grad_norm": 0.3360393627377263,
+      "learning_rate": 6.19978022706939e-05,
+      "loss": 0.5503,
+      "step": 3574
+    },
+    {
+      "epoch": 0.6355555555555555,
+      "grad_norm": 0.3511337103197416,
+      "learning_rate": 6.194454789926753e-05,
+      "loss": 0.6064,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.343008834351783,
+      "learning_rate": 6.18913061451838e-05,
+      "loss": 0.5664,
+      "step": 3576
+    },
+    {
+      "epoch": 0.6359111111111111,
+      "grad_norm": 0.3586285495123366,
+      "learning_rate": 6.183807702609502e-05,
+      "loss": 0.5865,
+      "step": 3577
+    },
+    {
+      "epoch": 0.6360888888888889,
+      "grad_norm": 0.542675413299601,
+      "learning_rate": 6.178486055964945e-05,
+      "loss": 0.587,
+      "step": 3578
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.3560700431742999,
+      "learning_rate": 6.173165676349103e-05,
+      "loss": 0.5908,
+      "step": 3579
+    },
+    {
+      "epoch": 0.6364444444444445,
+      "grad_norm": 0.35079808482782393,
+      "learning_rate": 6.167846565525959e-05,
+      "loss": 0.5714,
+      "step": 3580
+    },
+    {
+      "epoch": 0.6366222222222222,
+      "grad_norm": 0.3426357295417441,
+      "learning_rate": 6.162528725259078e-05,
+      "loss": 0.5869,
+      "step": 3581
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.35842479969212593,
+      "learning_rate": 6.157212157311587e-05,
+      "loss": 0.5887,
+      "step": 3582
+    },
+    {
+      "epoch": 0.6369777777777778,
+      "grad_norm": 0.3355367005280455,
+      "learning_rate": 6.151896863446213e-05,
+      "loss": 0.5042,
+      "step": 3583
+    },
+    {
+      "epoch": 0.6371555555555556,
+      "grad_norm": 0.3840250612265355,
+      "learning_rate": 6.146582845425242e-05,
+      "loss": 0.6047,
+      "step": 3584
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.3433237126454255,
+      "learning_rate": 6.141270105010546e-05,
+      "loss": 0.6181,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6375111111111111,
+      "grad_norm": 0.3499383746671317,
+      "learning_rate": 6.135958643963572e-05,
+      "loss": 0.5421,
+      "step": 3586
+    },
+    {
+      "epoch": 0.6376888888888889,
+      "grad_norm": 0.35382809154052436,
+      "learning_rate": 6.130648464045347e-05,
+      "loss": 0.575,
+      "step": 3587
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.3518188382626932,
+      "learning_rate": 6.125339567016463e-05,
+      "loss": 0.6014,
+      "step": 3588
+    },
+    {
+      "epoch": 0.6380444444444444,
+      "grad_norm": 0.37384338260400213,
+      "learning_rate": 6.120031954637101e-05,
+      "loss": 0.586,
+      "step": 3589
+    },
+    {
+      "epoch": 0.6382222222222222,
+      "grad_norm": 0.3653998650388993,
+      "learning_rate": 6.114725628666998e-05,
+      "loss": 0.5737,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.37182084882117417,
+      "learning_rate": 6.109420590865483e-05,
+      "loss": 0.6172,
+      "step": 3591
+    },
+    {
+      "epoch": 0.6385777777777778,
+      "grad_norm": 0.3400798485017609,
+      "learning_rate": 6.104116842991441e-05,
+      "loss": 0.529,
+      "step": 3592
+    },
+    {
+      "epoch": 0.6387555555555555,
+      "grad_norm": 0.3426619492386837,
+      "learning_rate": 6.098814386803347e-05,
+      "loss": 0.6251,
+      "step": 3593
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3501116711393118,
+      "learning_rate": 6.0935132240592295e-05,
+      "loss": 0.5691,
+      "step": 3594
+    },
+    {
+      "epoch": 0.6391111111111111,
+      "grad_norm": 0.37362110136957377,
+      "learning_rate": 6.0882133565167055e-05,
+      "loss": 0.6166,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6392888888888889,
+      "grad_norm": 0.34956877901583,
+      "learning_rate": 6.082914785932947e-05,
+      "loss": 0.5668,
+      "step": 3596
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.3425135692081391,
+      "learning_rate": 6.0776175140647064e-05,
+      "loss": 0.5861,
+      "step": 3597
+    },
+    {
+      "epoch": 0.6396444444444445,
+      "grad_norm": 0.3633986640495545,
+      "learning_rate": 6.072321542668301e-05,
+      "loss": 0.6178,
+      "step": 3598
+    },
+    {
+      "epoch": 0.6398222222222222,
+      "grad_norm": 0.35718099890882815,
+      "learning_rate": 6.067026873499622e-05,
+      "loss": 0.5801,
+      "step": 3599
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3428943765469151,
+      "learning_rate": 6.061733508314116e-05,
+      "loss": 0.5551,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6401777777777777,
+      "grad_norm": 0.3620896830376724,
+      "learning_rate": 6.0564414488668165e-05,
+      "loss": 0.5534,
+      "step": 3601
+    },
+    {
+      "epoch": 0.6403555555555556,
+      "grad_norm": 0.35713715218538716,
+      "learning_rate": 6.0511506969123044e-05,
+      "loss": 0.6019,
+      "step": 3602
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.3399310349375786,
+      "learning_rate": 6.0458612542047456e-05,
+      "loss": 0.5524,
+      "step": 3603
+    },
+    {
+      "epoch": 0.6407111111111111,
+      "grad_norm": 0.35280661807942976,
+      "learning_rate": 6.0405731224978546e-05,
+      "loss": 0.5915,
+      "step": 3604
+    },
+    {
+      "epoch": 0.6408888888888888,
+      "grad_norm": 0.3624197279910715,
+      "learning_rate": 6.035286303544927e-05,
+      "loss": 0.5965,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.37170746095734314,
+      "learning_rate": 6.030000799098808e-05,
+      "loss": 0.596,
+      "step": 3606
+    },
+    {
+      "epoch": 0.6412444444444444,
+      "grad_norm": 0.34632664950674946,
+      "learning_rate": 6.024716610911924e-05,
+      "loss": 0.5892,
+      "step": 3607
+    },
+    {
+      "epoch": 0.6414222222222222,
+      "grad_norm": 0.3471931167338859,
+      "learning_rate": 6.0194337407362466e-05,
+      "loss": 0.5871,
+      "step": 3608
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3570258689320991,
+      "learning_rate": 6.0141521903233235e-05,
+      "loss": 0.6014,
+      "step": 3609
+    },
+    {
+      "epoch": 0.6417777777777778,
+      "grad_norm": 0.38872224192835486,
+      "learning_rate": 6.008871961424258e-05,
+      "loss": 0.5682,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6419555555555555,
+      "grad_norm": 0.3795415590392823,
+      "learning_rate": 6.003593055789725e-05,
+      "loss": 0.604,
+      "step": 3611
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.35418504513328236,
+      "learning_rate": 5.998315475169942e-05,
+      "loss": 0.5997,
+      "step": 3612
+    },
+    {
+      "epoch": 0.6423111111111112,
+      "grad_norm": 0.35941259139829723,
+      "learning_rate": 5.9930392213147116e-05,
+      "loss": 0.592,
+      "step": 3613
+    },
+    {
+      "epoch": 0.6424888888888889,
+      "grad_norm": 0.35565618143860017,
+      "learning_rate": 5.987764295973373e-05,
+      "loss": 0.6069,
+      "step": 3614
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.3672921698113864,
+      "learning_rate": 5.982490700894844e-05,
+      "loss": 0.6058,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6428444444444444,
+      "grad_norm": 0.36360234288234605,
+      "learning_rate": 5.9772184378275854e-05,
+      "loss": 0.5763,
+      "step": 3616
+    },
+    {
+      "epoch": 0.6430222222222223,
+      "grad_norm": 0.34416994375996074,
+      "learning_rate": 5.971947508519631e-05,
+      "loss": 0.5716,
+      "step": 3617
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3785566191898189,
+      "learning_rate": 5.9666779147185593e-05,
+      "loss": 0.5764,
+      "step": 3618
+    },
+    {
+      "epoch": 0.6433777777777778,
+      "grad_norm": 0.3449185400170919,
+      "learning_rate": 5.9614096581715196e-05,
+      "loss": 0.5536,
+      "step": 3619
+    },
+    {
+      "epoch": 0.6435555555555555,
+      "grad_norm": 0.36290689660054826,
+      "learning_rate": 5.956142740625203e-05,
+      "loss": 0.5884,
+      "step": 3620
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.35742552147412654,
+      "learning_rate": 5.9508771638258654e-05,
+      "loss": 0.6239,
+      "step": 3621
+    },
+    {
+      "epoch": 0.6439111111111111,
+      "grad_norm": 0.33570459402326264,
+      "learning_rate": 5.94561292951932e-05,
+      "loss": 0.5522,
+      "step": 3622
+    },
+    {
+      "epoch": 0.6440888888888889,
+      "grad_norm": 0.3439704717730226,
+      "learning_rate": 5.94035003945093e-05,
+      "loss": 0.6095,
+      "step": 3623
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.33866081392070085,
+      "learning_rate": 5.935088495365613e-05,
+      "loss": 0.5909,
+      "step": 3624
+    },
+    {
+      "epoch": 0.6444444444444445,
+      "grad_norm": 0.3648555868908178,
+      "learning_rate": 5.929828299007845e-05,
+      "loss": 0.6537,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6446222222222222,
+      "grad_norm": 0.3621514256478139,
+      "learning_rate": 5.9245694521216464e-05,
+      "loss": 0.5785,
+      "step": 3626
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.330012553903769,
+      "learning_rate": 5.9193119564506035e-05,
+      "loss": 0.5645,
+      "step": 3627
+    },
+    {
+      "epoch": 0.6449777777777778,
+      "grad_norm": 0.3543822424593375,
+      "learning_rate": 5.914055813737839e-05,
+      "loss": 0.6368,
+      "step": 3628
+    },
+    {
+      "epoch": 0.6451555555555556,
+      "grad_norm": 0.3448404119091526,
+      "learning_rate": 5.908801025726043e-05,
+      "loss": 0.6099,
+      "step": 3629
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.38372058412664345,
+      "learning_rate": 5.90354759415744e-05,
+      "loss": 0.6211,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6455111111111111,
+      "grad_norm": 0.34748770951860847,
+      "learning_rate": 5.898295520773822e-05,
+      "loss": 0.5741,
+      "step": 3631
+    },
+    {
+      "epoch": 0.6456888888888889,
+      "grad_norm": 0.3332256196844322,
+      "learning_rate": 5.893044807316516e-05,
+      "loss": 0.5112,
+      "step": 3632
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.3524257255133568,
+      "learning_rate": 5.8877954555264034e-05,
+      "loss": 0.5948,
+      "step": 3633
+    },
+    {
+      "epoch": 0.6460444444444444,
+      "grad_norm": 0.37536332792058563,
+      "learning_rate": 5.88254746714392e-05,
+      "loss": 0.6007,
+      "step": 3634
+    },
+    {
+      "epoch": 0.6462222222222223,
+      "grad_norm": 0.3401923225619182,
+      "learning_rate": 5.877300843909039e-05,
+      "loss": 0.5933,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3827200268635426,
+      "learning_rate": 5.872055587561287e-05,
+      "loss": 0.6173,
+      "step": 3636
+    },
+    {
+      "epoch": 0.6465777777777778,
+      "grad_norm": 0.33696110007659563,
+      "learning_rate": 5.86681169983974e-05,
+      "loss": 0.6143,
+      "step": 3637
+    },
+    {
+      "epoch": 0.6467555555555555,
+      "grad_norm": 0.3616523708236569,
+      "learning_rate": 5.861569182483013e-05,
+      "loss": 0.5623,
+      "step": 3638
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.330474537152214,
+      "learning_rate": 5.856328037229275e-05,
+      "loss": 0.5439,
+      "step": 3639
+    },
+    {
+      "epoch": 0.6471111111111111,
+      "grad_norm": 0.3570580044650607,
+      "learning_rate": 5.851088265816229e-05,
+      "loss": 0.601,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6472888888888889,
+      "grad_norm": 0.3844020305251516,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.6124,
+      "step": 3641
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.3526083964359886,
+      "learning_rate": 5.8406128514607894e-05,
+      "loss": 0.5936,
+      "step": 3642
+    },
+    {
+      "epoch": 0.6476444444444445,
+      "grad_norm": 0.37011468592008107,
+      "learning_rate": 5.8353772119915376e-05,
+      "loss": 0.5882,
+      "step": 3643
+    },
+    {
+      "epoch": 0.6478222222222222,
+      "grad_norm": 0.3553719660528463,
+      "learning_rate": 5.83014295330925e-05,
+      "loss": 0.5586,
+      "step": 3644
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.36940514675211517,
+      "learning_rate": 5.824910077149371e-05,
+      "loss": 0.6187,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6481777777777777,
+      "grad_norm": 0.37706720400180127,
+      "learning_rate": 5.8196785852468524e-05,
+      "loss": 0.6348,
+      "step": 3646
+    },
+    {
+      "epoch": 0.6483555555555556,
+      "grad_norm": 0.3574009887838271,
+      "learning_rate": 5.8144484793362183e-05,
+      "loss": 0.5933,
+      "step": 3647
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.3494989304118295,
+      "learning_rate": 5.809219761151504e-05,
+      "loss": 0.567,
+      "step": 3648
+    },
+    {
+      "epoch": 0.6487111111111111,
+      "grad_norm": 0.33222398716018736,
+      "learning_rate": 5.803992432426313e-05,
+      "loss": 0.5553,
+      "step": 3649
+    },
+    {
+      "epoch": 0.6488888888888888,
+      "grad_norm": 0.35770588020278876,
+      "learning_rate": 5.798766494893759e-05,
+      "loss": 0.5887,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.34363918052300946,
+      "learning_rate": 5.793541950286525e-05,
+      "loss": 0.5529,
+      "step": 3651
+    },
+    {
+      "epoch": 0.6492444444444444,
+      "grad_norm": 0.40714223195235766,
+      "learning_rate": 5.788318800336805e-05,
+      "loss": 0.5717,
+      "step": 3652
+    },
+    {
+      "epoch": 0.6494222222222222,
+      "grad_norm": 0.35105818741588674,
+      "learning_rate": 5.7830970467763456e-05,
+      "loss": 0.5854,
+      "step": 3653
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.36190994315693215,
+      "learning_rate": 5.777876691336428e-05,
+      "loss": 0.6103,
+      "step": 3654
+    },
+    {
+      "epoch": 0.6497777777777778,
+      "grad_norm": 0.35405315857611974,
+      "learning_rate": 5.772657735747868e-05,
+      "loss": 0.594,
+      "step": 3655
+    },
+    {
+      "epoch": 0.6499555555555555,
+      "grad_norm": 0.34851815309047113,
+      "learning_rate": 5.767440181741019e-05,
+      "loss": 0.5901,
+      "step": 3656
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.33081768470922984,
+      "learning_rate": 5.762224031045769e-05,
+      "loss": 0.569,
+      "step": 3657
+    },
+    {
+      "epoch": 0.6503111111111111,
+      "grad_norm": 0.34795808382731863,
+      "learning_rate": 5.757009285391539e-05,
+      "loss": 0.5517,
+      "step": 3658
+    },
+    {
+      "epoch": 0.6504888888888889,
+      "grad_norm": 0.35188535283666056,
+      "learning_rate": 5.751795946507289e-05,
+      "loss": 0.5943,
+      "step": 3659
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.3416260959050426,
+      "learning_rate": 5.746584016121506e-05,
+      "loss": 0.576,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6508444444444444,
+      "grad_norm": 0.3782479717293079,
+      "learning_rate": 5.7413734959622154e-05,
+      "loss": 0.5989,
+      "step": 3661
+    },
+    {
+      "epoch": 0.6510222222222222,
+      "grad_norm": 0.3640211225542614,
+      "learning_rate": 5.7361643877569726e-05,
+      "loss": 0.5537,
+      "step": 3662
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3611950292266739,
+      "learning_rate": 5.730956693232865e-05,
+      "loss": 0.6005,
+      "step": 3663
+    },
+    {
+      "epoch": 0.6513777777777778,
+      "grad_norm": 0.38188780470835093,
+      "learning_rate": 5.725750414116512e-05,
+      "loss": 0.5875,
+      "step": 3664
+    },
+    {
+      "epoch": 0.6515555555555556,
+      "grad_norm": 0.3658645736037116,
+      "learning_rate": 5.7205455521340664e-05,
+      "loss": 0.56,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.3363545348744477,
+      "learning_rate": 5.715342109011197e-05,
+      "loss": 0.5807,
+      "step": 3666
+    },
+    {
+      "epoch": 0.6519111111111111,
+      "grad_norm": 0.38168650893393086,
+      "learning_rate": 5.710140086473129e-05,
+      "loss": 0.6215,
+      "step": 3667
+    },
+    {
+      "epoch": 0.6520888888888889,
+      "grad_norm": 0.32285625948519464,
+      "learning_rate": 5.704939486244585e-05,
+      "loss": 0.5718,
+      "step": 3668
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4351810686514931,
+      "learning_rate": 5.699740310049847e-05,
+      "loss": 0.5813,
+      "step": 3669
+    },
+    {
+      "epoch": 0.6524444444444445,
+      "grad_norm": 0.38417927546628566,
+      "learning_rate": 5.694542559612694e-05,
+      "loss": 0.6186,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6526222222222222,
+      "grad_norm": 0.3399410619714422,
+      "learning_rate": 5.689346236656465e-05,
+      "loss": 0.5959,
+      "step": 3671
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3571073390895776,
+      "learning_rate": 5.684151342903992e-05,
+      "loss": 0.5782,
+      "step": 3672
+    },
+    {
+      "epoch": 0.6529777777777778,
+      "grad_norm": 0.3782638001435793,
+      "learning_rate": 5.6789578800776657e-05,
+      "loss": 0.5614,
+      "step": 3673
+    },
+    {
+      "epoch": 0.6531555555555556,
+      "grad_norm": 0.33367554548758843,
+      "learning_rate": 5.6737658498993705e-05,
+      "loss": 0.5749,
+      "step": 3674
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.3605185855518717,
+      "learning_rate": 5.668575254090549e-05,
+      "loss": 0.5976,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6535111111111112,
+      "grad_norm": 0.32927454750553914,
+      "learning_rate": 5.6633860943721376e-05,
+      "loss": 0.5539,
+      "step": 3676
+    },
+    {
+      "epoch": 0.6536888888888889,
+      "grad_norm": 0.3330113145026618,
+      "learning_rate": 5.6581983724646134e-05,
+      "loss": 0.5243,
+      "step": 3677
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.35874514621568315,
+      "learning_rate": 5.653012090087977e-05,
+      "loss": 0.5664,
+      "step": 3678
+    },
+    {
+      "epoch": 0.6540444444444444,
+      "grad_norm": 0.352777141995489,
+      "learning_rate": 5.6478272489617435e-05,
+      "loss": 0.563,
+      "step": 3679
+    },
+    {
+      "epoch": 0.6542222222222223,
+      "grad_norm": 0.360922996740039,
+      "learning_rate": 5.6426438508049586e-05,
+      "loss": 0.602,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.341169596886129,
+      "learning_rate": 5.637461897336185e-05,
+      "loss": 0.5374,
+      "step": 3681
+    },
+    {
+      "epoch": 0.6545777777777778,
+      "grad_norm": 0.33385876548773685,
+      "learning_rate": 5.632281390273504e-05,
+      "loss": 0.6026,
+      "step": 3682
+    },
+    {
+      "epoch": 0.6547555555555555,
+      "grad_norm": 0.3642168212086464,
+      "learning_rate": 5.627102331334525e-05,
+      "loss": 0.5521,
+      "step": 3683
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.3407992070632586,
+      "learning_rate": 5.62192472223637e-05,
+      "loss": 0.5686,
+      "step": 3684
+    },
+    {
+      "epoch": 0.6551111111111111,
+      "grad_norm": 0.3511116529870407,
+      "learning_rate": 5.616748564695684e-05,
+      "loss": 0.5886,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6552888888888889,
+      "grad_norm": 0.33872765751719236,
+      "learning_rate": 5.611573860428631e-05,
+      "loss": 0.5378,
+      "step": 3686
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.35819948424700543,
+      "learning_rate": 5.606400611150889e-05,
+      "loss": 0.571,
+      "step": 3687
+    },
+    {
+      "epoch": 0.6556444444444445,
+      "grad_norm": 0.35103158782362637,
+      "learning_rate": 5.60122881857766e-05,
+      "loss": 0.5859,
+      "step": 3688
+    },
+    {
+      "epoch": 0.6558222222222222,
+      "grad_norm": 0.3611830397005914,
+      "learning_rate": 5.596058484423656e-05,
+      "loss": 0.5793,
+      "step": 3689
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.34265305509760013,
+      "learning_rate": 5.590889610403113e-05,
+      "loss": 0.6121,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6561777777777777,
+      "grad_norm": 0.3347845244091521,
+      "learning_rate": 5.58572219822978e-05,
+      "loss": 0.5493,
+      "step": 3691
+    },
+    {
+      "epoch": 0.6563555555555556,
+      "grad_norm": 0.3514548575455999,
+      "learning_rate": 5.580556249616911e-05,
+      "loss": 0.5787,
+      "step": 3692
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.37089259067215413,
+      "learning_rate": 5.575391766277297e-05,
+      "loss": 0.5918,
+      "step": 3693
+    },
+    {
+      "epoch": 0.6567111111111111,
+      "grad_norm": 0.34641717036089437,
+      "learning_rate": 5.570228749923217e-05,
+      "loss": 0.5519,
+      "step": 3694
+    },
+    {
+      "epoch": 0.6568888888888889,
+      "grad_norm": 0.3349639923342679,
+      "learning_rate": 5.5650672022664896e-05,
+      "loss": 0.5448,
+      "step": 3695
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.33892264641173697,
+      "learning_rate": 5.559907125018421e-05,
+      "loss": 0.5888,
+      "step": 3696
+    },
+    {
+      "epoch": 0.6572444444444444,
+      "grad_norm": 0.3706524664214029,
+      "learning_rate": 5.554748519889858e-05,
+      "loss": 0.658,
+      "step": 3697
+    },
+    {
+      "epoch": 0.6574222222222222,
+      "grad_norm": 0.34529823305343504,
+      "learning_rate": 5.5495913885911265e-05,
+      "loss": 0.5921,
+      "step": 3698
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.3563740637613725,
+      "learning_rate": 5.5444357328320985e-05,
+      "loss": 0.566,
+      "step": 3699
+    },
+    {
+      "epoch": 0.6577777777777778,
+      "grad_norm": 0.3667431109579284,
+      "learning_rate": 5.5392815543221254e-05,
+      "loss": 0.6147,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6579555555555555,
+      "grad_norm": 0.36518416485491273,
+      "learning_rate": 5.534128854770089e-05,
+      "loss": 0.6201,
+      "step": 3701
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.34773657590350704,
+      "learning_rate": 5.528977635884375e-05,
+      "loss": 0.5689,
+      "step": 3702
+    },
+    {
+      "epoch": 0.6583111111111111,
+      "grad_norm": 0.3530176806566833,
+      "learning_rate": 5.5238278993728756e-05,
+      "loss": 0.5857,
+      "step": 3703
+    },
+    {
+      "epoch": 0.6584888888888889,
+      "grad_norm": 0.33495610749260346,
+      "learning_rate": 5.5186796469429956e-05,
+      "loss": 0.5804,
+      "step": 3704
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.35968340807869076,
+      "learning_rate": 5.513532880301645e-05,
+      "loss": 0.5413,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6588444444444445,
+      "grad_norm": 0.37352011675406216,
+      "learning_rate": 5.508387601155243e-05,
+      "loss": 0.625,
+      "step": 3706
+    },
+    {
+      "epoch": 0.6590222222222222,
+      "grad_norm": 0.3568111178401562,
+      "learning_rate": 5.503243811209713e-05,
+      "loss": 0.5753,
+      "step": 3707
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3582129020937405,
+      "learning_rate": 5.498101512170486e-05,
+      "loss": 0.5989,
+      "step": 3708
+    },
+    {
+      "epoch": 0.6593777777777777,
+      "grad_norm": 0.33605525824591304,
+      "learning_rate": 5.4929607057425015e-05,
+      "loss": 0.5224,
+      "step": 3709
+    },
+    {
+      "epoch": 0.6595555555555556,
+      "grad_norm": 0.37212750134613615,
+      "learning_rate": 5.4878213936302e-05,
+      "loss": 0.5551,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.3252409361183586,
+      "learning_rate": 5.4826835775375285e-05,
+      "loss": 0.6087,
+      "step": 3711
+    },
+    {
+      "epoch": 0.6599111111111111,
+      "grad_norm": 0.35620452460367513,
+      "learning_rate": 5.477547259167939e-05,
+      "loss": 0.592,
+      "step": 3712
+    },
+    {
+      "epoch": 0.6600888888888888,
+      "grad_norm": 0.37465822067242516,
+      "learning_rate": 5.4724124402243837e-05,
+      "loss": 0.6187,
+      "step": 3713
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.38319880412547924,
+      "learning_rate": 5.467279122409319e-05,
+      "loss": 0.581,
+      "step": 3714
+    },
+    {
+      "epoch": 0.6604444444444444,
+      "grad_norm": 0.3437437267363973,
+      "learning_rate": 5.46214730742471e-05,
+      "loss": 0.525,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6606222222222222,
+      "grad_norm": 0.35851619945769814,
+      "learning_rate": 5.4570169969720055e-05,
+      "loss": 0.596,
+      "step": 3716
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4593652326437602,
+      "learning_rate": 5.451888192752184e-05,
+      "loss": 0.6269,
+      "step": 3717
+    },
+    {
+      "epoch": 0.6609777777777778,
+      "grad_norm": 0.36749277886033155,
+      "learning_rate": 5.4467608964656905e-05,
+      "loss": 0.5828,
+      "step": 3718
+    },
+    {
+      "epoch": 0.6611555555555556,
+      "grad_norm": 0.3591357295312584,
+      "learning_rate": 5.441635109812504e-05,
+      "loss": 0.6097,
+      "step": 3719
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.34741984473289583,
+      "learning_rate": 5.436510834492072e-05,
+      "loss": 0.5938,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6615111111111112,
+      "grad_norm": 0.34992349716413,
+      "learning_rate": 5.431388072203373e-05,
+      "loss": 0.5635,
+      "step": 3721
+    },
+    {
+      "epoch": 0.6616888888888889,
+      "grad_norm": 0.3418413646637175,
+      "learning_rate": 5.4262668246448475e-05,
+      "loss": 0.5847,
+      "step": 3722
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.36975539394784773,
+      "learning_rate": 5.4211470935144715e-05,
+      "loss": 0.5424,
+      "step": 3723
+    },
+    {
+      "epoch": 0.6620444444444444,
+      "grad_norm": 0.33751654601563225,
+      "learning_rate": 5.4160288805096845e-05,
+      "loss": 0.599,
+      "step": 3724
+    },
+    {
+      "epoch": 0.6622222222222223,
+      "grad_norm": 0.3431978875411624,
+      "learning_rate": 5.410912187327446e-05,
+      "loss": 0.5164,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.37189954351334,
+      "learning_rate": 5.4057970156641994e-05,
+      "loss": 0.6025,
+      "step": 3726
+    },
+    {
+      "epoch": 0.6625777777777778,
+      "grad_norm": 0.32721099794335723,
+      "learning_rate": 5.4006833672158885e-05,
+      "loss": 0.5299,
+      "step": 3727
+    },
+    {
+      "epoch": 0.6627555555555555,
+      "grad_norm": 0.368190475024461,
+      "learning_rate": 5.3955712436779534e-05,
+      "loss": 0.5913,
+      "step": 3728
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.3407316840208625,
+      "learning_rate": 5.3904606467453254e-05,
+      "loss": 0.5816,
+      "step": 3729
+    },
+    {
+      "epoch": 0.6631111111111111,
+      "grad_norm": 0.3549760611940972,
+      "learning_rate": 5.385351578112429e-05,
+      "loss": 0.605,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6632888888888889,
+      "grad_norm": 0.3396666114160034,
+      "learning_rate": 5.380244039473184e-05,
+      "loss": 0.5629,
+      "step": 3731
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.34588927208694353,
+      "learning_rate": 5.375138032521004e-05,
+      "loss": 0.586,
+      "step": 3732
+    },
+    {
+      "epoch": 0.6636444444444445,
+      "grad_norm": 0.364545977171092,
+      "learning_rate": 5.3700335589487925e-05,
+      "loss": 0.5552,
+      "step": 3733
+    },
+    {
+      "epoch": 0.6638222222222222,
+      "grad_norm": 0.3376695797701531,
+      "learning_rate": 5.364930620448946e-05,
+      "loss": 0.5871,
+      "step": 3734
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3670168776506038,
+      "learning_rate": 5.35982921871335e-05,
+      "loss": 0.6,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6641777777777778,
+      "grad_norm": 0.36599522643191545,
+      "learning_rate": 5.354729355433383e-05,
+      "loss": 0.6191,
+      "step": 3736
+    },
+    {
+      "epoch": 0.6643555555555556,
+      "grad_norm": 0.35783214765567045,
+      "learning_rate": 5.3496310322999134e-05,
+      "loss": 0.6039,
+      "step": 3737
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.40812885957732903,
+      "learning_rate": 5.344534251003296e-05,
+      "loss": 0.5612,
+      "step": 3738
+    },
+    {
+      "epoch": 0.6647111111111111,
+      "grad_norm": 0.35493173709646303,
+      "learning_rate": 5.3394390132333805e-05,
+      "loss": 0.5777,
+      "step": 3739
+    },
+    {
+      "epoch": 0.6648888888888889,
+      "grad_norm": 0.346816382573614,
+      "learning_rate": 5.33434532067949e-05,
+      "loss": 0.5815,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.3428371997888651,
+      "learning_rate": 5.329253175030462e-05,
+      "loss": 0.5586,
+      "step": 3741
+    },
+    {
+      "epoch": 0.6652444444444444,
+      "grad_norm": 0.3203381640603382,
+      "learning_rate": 5.3241625779745873e-05,
+      "loss": 0.5272,
+      "step": 3742
+    },
+    {
+      "epoch": 0.6654222222222222,
+      "grad_norm": 0.35403288824240003,
+      "learning_rate": 5.319073531199679e-05,
+      "loss": 0.5714,
+      "step": 3743
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.39248273989205185,
+      "learning_rate": 5.3139860363929996e-05,
+      "loss": 0.6162,
+      "step": 3744
+    },
+    {
+      "epoch": 0.6657777777777778,
+      "grad_norm": 0.4082558760683123,
+      "learning_rate": 5.3089000952413346e-05,
+      "loss": 0.6,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6659555555555555,
+      "grad_norm": 0.33500367072715787,
+      "learning_rate": 5.303815709430918e-05,
+      "loss": 0.5613,
+      "step": 3746
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.3412212071133008,
+      "learning_rate": 5.298732880647502e-05,
+      "loss": 0.5969,
+      "step": 3747
+    },
+    {
+      "epoch": 0.6663111111111111,
+      "grad_norm": 0.3502787660500914,
+      "learning_rate": 5.29365161057629e-05,
+      "loss": 0.5725,
+      "step": 3748
+    },
+    {
+      "epoch": 0.6664888888888889,
+      "grad_norm": 0.3417881819264822,
+      "learning_rate": 5.2885719009020006e-05,
+      "loss": 0.572,
+      "step": 3749
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.4092137058679559,
+      "learning_rate": 5.283493753308808e-05,
+      "loss": 0.5675,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6668444444444445,
+      "grad_norm": 0.35247479931877196,
+      "learning_rate": 5.278417169480383e-05,
+      "loss": 0.5846,
+      "step": 3751
+    },
+    {
+      "epoch": 0.6670222222222222,
+      "grad_norm": 0.3792257183861252,
+      "learning_rate": 5.273342151099874e-05,
+      "loss": 0.5847,
+      "step": 3752
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3239661765215962,
+      "learning_rate": 5.268268699849912e-05,
+      "loss": 0.5244,
+      "step": 3753
+    },
+    {
+      "epoch": 0.6673777777777777,
+      "grad_norm": 0.3553481753323244,
+      "learning_rate": 5.263196817412608e-05,
+      "loss": 0.6102,
+      "step": 3754
+    },
+    {
+      "epoch": 0.6675555555555556,
+      "grad_norm": 0.3450954912095572,
+      "learning_rate": 5.2581265054695494e-05,
+      "loss": 0.5829,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.3296186380513435,
+      "learning_rate": 5.25305776570181e-05,
+      "loss": 0.583,
+      "step": 3756
+    },
+    {
+      "epoch": 0.6679111111111111,
+      "grad_norm": 0.37008182550415547,
+      "learning_rate": 5.247990599789935e-05,
+      "loss": 0.6049,
+      "step": 3757
+    },
+    {
+      "epoch": 0.6680888888888888,
+      "grad_norm": 0.40823817708212623,
+      "learning_rate": 5.2429250094139526e-05,
+      "loss": 0.5689,
+      "step": 3758
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.3812022744836966,
+      "learning_rate": 5.237860996253365e-05,
+      "loss": 0.6231,
+      "step": 3759
+    },
+    {
+      "epoch": 0.6684444444444444,
+      "grad_norm": 0.3704249487965545,
+      "learning_rate": 5.2327985619871555e-05,
+      "loss": 0.5828,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6686222222222222,
+      "grad_norm": 0.3422141445527099,
+      "learning_rate": 5.2277377082937806e-05,
+      "loss": 0.5916,
+      "step": 3761
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3685350400713173,
+      "learning_rate": 5.2226784368511735e-05,
+      "loss": 0.6045,
+      "step": 3762
+    },
+    {
+      "epoch": 0.6689777777777778,
+      "grad_norm": 0.367866697588512,
+      "learning_rate": 5.217620749336745e-05,
+      "loss": 0.5773,
+      "step": 3763
+    },
+    {
+      "epoch": 0.6691555555555555,
+      "grad_norm": 0.3587826585206924,
+      "learning_rate": 5.2125646474273785e-05,
+      "loss": 0.6054,
+      "step": 3764
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.37339954378788254,
+      "learning_rate": 5.207510132799436e-05,
+      "loss": 0.6345,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6695111111111111,
+      "grad_norm": 0.3451301432072929,
+      "learning_rate": 5.202457207128736e-05,
+      "loss": 0.5562,
+      "step": 3766
+    },
+    {
+      "epoch": 0.6696888888888889,
+      "grad_norm": 0.42835158085976643,
+      "learning_rate": 5.1974058720906014e-05,
+      "loss": 0.5966,
+      "step": 3767
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.3423620134896019,
+      "learning_rate": 5.192356129359794e-05,
+      "loss": 0.5154,
+      "step": 3768
+    },
+    {
+      "epoch": 0.6700444444444444,
+      "grad_norm": 0.35299452449965724,
+      "learning_rate": 5.1873079806105785e-05,
+      "loss": 0.5207,
+      "step": 3769
+    },
+    {
+      "epoch": 0.6702222222222223,
+      "grad_norm": 0.3613850679716077,
+      "learning_rate": 5.1822614275166614e-05,
+      "loss": 0.6096,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3642423032154287,
+      "learning_rate": 5.17721647175125e-05,
+      "loss": 0.6047,
+      "step": 3771
+    },
+    {
+      "epoch": 0.6705777777777778,
+      "grad_norm": 0.36754967202922495,
+      "learning_rate": 5.1721731149869925e-05,
+      "loss": 0.5757,
+      "step": 3772
+    },
+    {
+      "epoch": 0.6707555555555555,
+      "grad_norm": 0.3556164799296794,
+      "learning_rate": 5.1671313588960355e-05,
+      "loss": 0.5818,
+      "step": 3773
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.44022696041050036,
+      "learning_rate": 5.16209120514997e-05,
+      "loss": 0.5449,
+      "step": 3774
+    },
+    {
+      "epoch": 0.6711111111111111,
+      "grad_norm": 0.34366825152326763,
+      "learning_rate": 5.1570526554198704e-05,
+      "loss": 0.5346,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6712888888888889,
+      "grad_norm": 0.3429518023848944,
+      "learning_rate": 5.152015711376274e-05,
+      "loss": 0.6139,
+      "step": 3776
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.36547213032738785,
+      "learning_rate": 5.146980374689192e-05,
+      "loss": 0.6601,
+      "step": 3777
+    },
+    {
+      "epoch": 0.6716444444444445,
+      "grad_norm": 0.3460266225059201,
+      "learning_rate": 5.141946647028092e-05,
+      "loss": 0.5377,
+      "step": 3778
+    },
+    {
+      "epoch": 0.6718222222222222,
+      "grad_norm": 0.35100353735664414,
+      "learning_rate": 5.136914530061917e-05,
+      "loss": 0.5794,
+      "step": 3779
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.36933101402424406,
+      "learning_rate": 5.1318840254590725e-05,
+      "loss": 0.5673,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6721777777777778,
+      "grad_norm": 0.35799785565842296,
+      "learning_rate": 5.1268551348874296e-05,
+      "loss": 0.5994,
+      "step": 3781
+    },
+    {
+      "epoch": 0.6723555555555556,
+      "grad_norm": 0.34980612555669427,
+      "learning_rate": 5.121827860014326e-05,
+      "loss": 0.596,
+      "step": 3782
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.3869567562483847,
+      "learning_rate": 5.11680220250656e-05,
+      "loss": 0.5737,
+      "step": 3783
+    },
+    {
+      "epoch": 0.6727111111111111,
+      "grad_norm": 0.40077405619198875,
+      "learning_rate": 5.111778164030396e-05,
+      "loss": 0.6231,
+      "step": 3784
+    },
+    {
+      "epoch": 0.6728888888888889,
+      "grad_norm": 0.33628240154867695,
+      "learning_rate": 5.106755746251565e-05,
+      "loss": 0.5866,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.38035745082408773,
+      "learning_rate": 5.101734950835253e-05,
+      "loss": 0.5619,
+      "step": 3786
+    },
+    {
+      "epoch": 0.6732444444444444,
+      "grad_norm": 0.36311862093378333,
+      "learning_rate": 5.0967157794461154e-05,
+      "loss": 0.6134,
+      "step": 3787
+    },
+    {
+      "epoch": 0.6734222222222223,
+      "grad_norm": 0.3421953430124754,
+      "learning_rate": 5.0916982337482644e-05,
+      "loss": 0.535,
+      "step": 3788
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.35218075396362114,
+      "learning_rate": 5.086682315405279e-05,
+      "loss": 0.6128,
+      "step": 3789
+    },
+    {
+      "epoch": 0.6737777777777778,
+      "grad_norm": 0.39903409978381105,
+      "learning_rate": 5.081668026080183e-05,
+      "loss": 0.6067,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6739555555555555,
+      "grad_norm": 0.3435573961476874,
+      "learning_rate": 5.076655367435487e-05,
+      "loss": 0.523,
+      "step": 3791
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.3734677873865915,
+      "learning_rate": 5.071644341133131e-05,
+      "loss": 0.5682,
+      "step": 3792
+    },
+    {
+      "epoch": 0.6743111111111111,
+      "grad_norm": 0.32798848351480425,
+      "learning_rate": 5.066634948834541e-05,
+      "loss": 0.5562,
+      "step": 3793
+    },
+    {
+      "epoch": 0.6744888888888889,
+      "grad_norm": 0.34973305116801057,
+      "learning_rate": 5.061627192200575e-05,
+      "loss": 0.6369,
+      "step": 3794
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.39554832346531404,
+      "learning_rate": 5.0566210728915786e-05,
+      "loss": 0.5606,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6748444444444445,
+      "grad_norm": 0.3503082660038528,
+      "learning_rate": 5.051616592567323e-05,
+      "loss": 0.58,
+      "step": 3796
+    },
+    {
+      "epoch": 0.6750222222222222,
+      "grad_norm": 0.5505857665869399,
+      "learning_rate": 5.046613752887064e-05,
+      "loss": 0.5632,
+      "step": 3797
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.32473678902894215,
+      "learning_rate": 5.041612555509492e-05,
+      "loss": 0.5579,
+      "step": 3798
+    },
+    {
+      "epoch": 0.6753777777777777,
+      "grad_norm": 0.35967215565695027,
+      "learning_rate": 5.0366130020927624e-05,
+      "loss": 0.6195,
+      "step": 3799
+    },
+    {
+      "epoch": 0.6755555555555556,
+      "grad_norm": 0.3631887712577227,
+      "learning_rate": 5.031615094294488e-05,
+      "loss": 0.6026,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.3459963302028815,
+      "learning_rate": 5.02661883377173e-05,
+      "loss": 0.5307,
+      "step": 3801
+    },
+    {
+      "epoch": 0.6759111111111111,
+      "grad_norm": 0.3828382104822761,
+      "learning_rate": 5.0216242221810075e-05,
+      "loss": 0.6169,
+      "step": 3802
+    },
+    {
+      "epoch": 0.6760888888888889,
+      "grad_norm": 0.379709160745068,
+      "learning_rate": 5.0166312611782916e-05,
+      "loss": 0.5668,
+      "step": 3803
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.34704142397707294,
+      "learning_rate": 5.011639952419005e-05,
+      "loss": 0.5558,
+      "step": 3804
+    },
+    {
+      "epoch": 0.6764444444444444,
+      "grad_norm": 0.36262162029044515,
+      "learning_rate": 5.0066502975580244e-05,
+      "loss": 0.5498,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6766222222222222,
+      "grad_norm": 0.38016595918740326,
+      "learning_rate": 5.001662298249678e-05,
+      "loss": 0.618,
+      "step": 3806
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.35110390690577636,
+      "learning_rate": 4.9966759561477424e-05,
+      "loss": 0.5545,
+      "step": 3807
+    },
+    {
+      "epoch": 0.6769777777777778,
+      "grad_norm": 0.3617674763207987,
+      "learning_rate": 4.991691272905449e-05,
+      "loss": 0.6053,
+      "step": 3808
+    },
+    {
+      "epoch": 0.6771555555555555,
+      "grad_norm": 0.34578344893770513,
+      "learning_rate": 4.986708250175476e-05,
+      "loss": 0.574,
+      "step": 3809
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.34567593734131025,
+      "learning_rate": 4.981726889609952e-05,
+      "loss": 0.5964,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6775111111111111,
+      "grad_norm": 0.36819427764691953,
+      "learning_rate": 4.976747192860456e-05,
+      "loss": 0.5875,
+      "step": 3811
+    },
+    {
+      "epoch": 0.6776888888888889,
+      "grad_norm": 0.34110912747908734,
+      "learning_rate": 4.971769161578013e-05,
+      "loss": 0.5572,
+      "step": 3812
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.35040782541782817,
+      "learning_rate": 4.9667927974131e-05,
+      "loss": 0.5744,
+      "step": 3813
+    },
+    {
+      "epoch": 0.6780444444444444,
+      "grad_norm": 0.3265403757922863,
+      "learning_rate": 4.9618181020156274e-05,
+      "loss": 0.5501,
+      "step": 3814
+    },
+    {
+      "epoch": 0.6782222222222222,
+      "grad_norm": 0.35891825557525664,
+      "learning_rate": 4.9568450770349775e-05,
+      "loss": 0.6172,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.34750698466013846,
+      "learning_rate": 4.9518737241199495e-05,
+      "loss": 0.5723,
+      "step": 3816
+    },
+    {
+      "epoch": 0.6785777777777777,
+      "grad_norm": 0.369180027545402,
+      "learning_rate": 4.9469040449188185e-05,
+      "loss": 0.5641,
+      "step": 3817
+    },
+    {
+      "epoch": 0.6787555555555556,
+      "grad_norm": 0.37708735468285687,
+      "learning_rate": 4.9419360410792745e-05,
+      "loss": 0.5615,
+      "step": 3818
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.4431329040018839,
+      "learning_rate": 4.936969714248481e-05,
+      "loss": 0.6027,
+      "step": 3819
+    },
+    {
+      "epoch": 0.6791111111111111,
+      "grad_norm": 0.3575795416823746,
+      "learning_rate": 4.932005066073014e-05,
+      "loss": 0.6085,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6792888888888889,
+      "grad_norm": 0.3424091399815592,
+      "learning_rate": 4.9270420981989294e-05,
+      "loss": 0.5758,
+      "step": 3821
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.3810216072083059,
+      "learning_rate": 4.9220808122716924e-05,
+      "loss": 0.5942,
+      "step": 3822
+    },
+    {
+      "epoch": 0.6796444444444445,
+      "grad_norm": 0.37531728510793283,
+      "learning_rate": 4.91712120993623e-05,
+      "loss": 0.5483,
+      "step": 3823
+    },
+    {
+      "epoch": 0.6798222222222222,
+      "grad_norm": 0.3517852213212341,
+      "learning_rate": 4.912163292836903e-05,
+      "loss": 0.582,
+      "step": 3824
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.36090366339817603,
+      "learning_rate": 4.9072070626175203e-05,
+      "loss": 0.5374,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6801777777777778,
+      "grad_norm": 0.3478217412734873,
+      "learning_rate": 4.9022525209213264e-05,
+      "loss": 0.5559,
+      "step": 3826
+    },
+    {
+      "epoch": 0.6803555555555556,
+      "grad_norm": 0.34398676563975056,
+      "learning_rate": 4.8972996693910054e-05,
+      "loss": 0.5355,
+      "step": 3827
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.37694958802860384,
+      "learning_rate": 4.892348509668684e-05,
+      "loss": 0.5887,
+      "step": 3828
+    },
+    {
+      "epoch": 0.6807111111111112,
+      "grad_norm": 0.37462673336384417,
+      "learning_rate": 4.887399043395927e-05,
+      "loss": 0.6418,
+      "step": 3829
+    },
+    {
+      "epoch": 0.6808888888888889,
+      "grad_norm": 0.3384279319810809,
+      "learning_rate": 4.882451272213736e-05,
+      "loss": 0.5602,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.3532464607557794,
+      "learning_rate": 4.877505197762553e-05,
+      "loss": 0.5492,
+      "step": 3831
+    },
+    {
+      "epoch": 0.6812444444444444,
+      "grad_norm": 0.35755721729900775,
+      "learning_rate": 4.872560821682256e-05,
+      "loss": 0.5708,
+      "step": 3832
+    },
+    {
+      "epoch": 0.6814222222222223,
+      "grad_norm": 0.3786332812544826,
+      "learning_rate": 4.867618145612162e-05,
+      "loss": 0.565,
+      "step": 3833
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3385648540747402,
+      "learning_rate": 4.86267717119102e-05,
+      "loss": 0.5839,
+      "step": 3834
+    },
+    {
+      "epoch": 0.6817777777777778,
+      "grad_norm": 0.37030014327537,
+      "learning_rate": 4.85773790005702e-05,
+      "loss": 0.6042,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6819555555555555,
+      "grad_norm": 0.34660154424499623,
+      "learning_rate": 4.8528003338477823e-05,
+      "loss": 0.5794,
+      "step": 3836
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.3633376712867026,
+      "learning_rate": 4.847864474200371e-05,
+      "loss": 0.5519,
+      "step": 3837
+    },
+    {
+      "epoch": 0.6823111111111111,
+      "grad_norm": 0.35660589921265107,
+      "learning_rate": 4.8429303227512645e-05,
+      "loss": 0.5858,
+      "step": 3838
+    },
+    {
+      "epoch": 0.6824888888888889,
+      "grad_norm": 0.3393857504319453,
+      "learning_rate": 4.837997881136404e-05,
+      "loss": 0.57,
+      "step": 3839
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3301556350550329,
+      "learning_rate": 4.833067150991133e-05,
+      "loss": 0.5301,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6828444444444445,
+      "grad_norm": 0.36474614368015157,
+      "learning_rate": 4.8281381339502565e-05,
+      "loss": 0.5639,
+      "step": 3841
+    },
+    {
+      "epoch": 0.6830222222222222,
+      "grad_norm": 0.3401723902768643,
+      "learning_rate": 4.823210831647984e-05,
+      "loss": 0.5693,
+      "step": 3842
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.36054270662523535,
+      "learning_rate": 4.818285245717984e-05,
+      "loss": 0.5973,
+      "step": 3843
+    },
+    {
+      "epoch": 0.6833777777777778,
+      "grad_norm": 0.3694129586838312,
+      "learning_rate": 4.813361377793327e-05,
+      "loss": 0.5933,
+      "step": 3844
+    },
+    {
+      "epoch": 0.6835555555555556,
+      "grad_norm": 0.34642722604450854,
+      "learning_rate": 4.808439229506546e-05,
+      "loss": 0.6042,
+      "step": 3845
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.36260310020770314,
+      "learning_rate": 4.8035188024895685e-05,
+      "loss": 0.5615,
+      "step": 3846
+    },
+    {
+      "epoch": 0.6839111111111111,
+      "grad_norm": 0.35158156525650475,
+      "learning_rate": 4.7986000983737856e-05,
+      "loss": 0.6045,
+      "step": 3847
+    },
+    {
+      "epoch": 0.6840888888888889,
+      "grad_norm": 0.3440241194751509,
+      "learning_rate": 4.793683118789991e-05,
+      "loss": 0.4972,
+      "step": 3848
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.3323429868860486,
+      "learning_rate": 4.7887678653684184e-05,
+      "loss": 0.5597,
+      "step": 3849
+    },
+    {
+      "epoch": 0.6844444444444444,
+      "grad_norm": 0.3521174206342374,
+      "learning_rate": 4.783854339738729e-05,
+      "loss": 0.5585,
+      "step": 3850
+    },
+    {
+      "epoch": 0.6846222222222222,
+      "grad_norm": 0.3647623603677191,
+      "learning_rate": 4.7789425435300107e-05,
+      "loss": 0.599,
+      "step": 3851
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.34949764487443796,
+      "learning_rate": 4.7740324783707734e-05,
+      "loss": 0.5994,
+      "step": 3852
+    },
+    {
+      "epoch": 0.6849777777777778,
+      "grad_norm": 0.3540874873670466,
+      "learning_rate": 4.76912414588896e-05,
+      "loss": 0.5629,
+      "step": 3853
+    },
+    {
+      "epoch": 0.6851555555555555,
+      "grad_norm": 0.36175317590547346,
+      "learning_rate": 4.764217547711934e-05,
+      "loss": 0.6116,
+      "step": 3854
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.35043526832572747,
+      "learning_rate": 4.759312685466486e-05,
+      "loss": 0.5409,
+      "step": 3855
+    },
+    {
+      "epoch": 0.6855111111111111,
+      "grad_norm": 0.34570341800457427,
+      "learning_rate": 4.75440956077883e-05,
+      "loss": 0.6085,
+      "step": 3856
+    },
+    {
+      "epoch": 0.6856888888888889,
+      "grad_norm": 0.3660548286797606,
+      "learning_rate": 4.749508175274605e-05,
+      "loss": 0.5927,
+      "step": 3857
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.3649511403309351,
+      "learning_rate": 4.7446085305788725e-05,
+      "loss": 0.5539,
+      "step": 3858
+    },
+    {
+      "epoch": 0.6860444444444445,
+      "grad_norm": 0.3534702171556326,
+      "learning_rate": 4.7397106283161166e-05,
+      "loss": 0.5525,
+      "step": 3859
+    },
+    {
+      "epoch": 0.6862222222222222,
+      "grad_norm": 0.34427844562878507,
+      "learning_rate": 4.734814470110244e-05,
+      "loss": 0.5718,
+      "step": 3860
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.37661405409639365,
+      "learning_rate": 4.729920057584584e-05,
+      "loss": 0.5797,
+      "step": 3861
+    },
+    {
+      "epoch": 0.6865777777777777,
+      "grad_norm": 0.3606437950815262,
+      "learning_rate": 4.725027392361887e-05,
+      "loss": 0.573,
+      "step": 3862
+    },
+    {
+      "epoch": 0.6867555555555556,
+      "grad_norm": 0.35634022099202656,
+      "learning_rate": 4.7201364760643264e-05,
+      "loss": 0.5706,
+      "step": 3863
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.3534861640952797,
+      "learning_rate": 4.715247310313482e-05,
+      "loss": 0.5837,
+      "step": 3864
+    },
+    {
+      "epoch": 0.6871111111111111,
+      "grad_norm": 0.3553531452104539,
+      "learning_rate": 4.710359896730379e-05,
+      "loss": 0.5756,
+      "step": 3865
+    },
+    {
+      "epoch": 0.6872888888888888,
+      "grad_norm": 0.3823974292421038,
+      "learning_rate": 4.7054742369354324e-05,
+      "loss": 0.5861,
+      "step": 3866
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.3760494487706127,
+      "learning_rate": 4.700590332548503e-05,
+      "loss": 0.5547,
+      "step": 3867
+    },
+    {
+      "epoch": 0.6876444444444444,
+      "grad_norm": 0.371549022417296,
+      "learning_rate": 4.695708185188844e-05,
+      "loss": 0.5946,
+      "step": 3868
+    },
+    {
+      "epoch": 0.6878222222222222,
+      "grad_norm": 0.3615090859357574,
+      "learning_rate": 4.690827796475152e-05,
+      "loss": 0.5449,
+      "step": 3869
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.35782069921441423,
+      "learning_rate": 4.685949168025514e-05,
+      "loss": 0.6002,
+      "step": 3870
+    },
+    {
+      "epoch": 0.6881777777777778,
+      "grad_norm": 0.36235867684088197,
+      "learning_rate": 4.681072301457461e-05,
+      "loss": 0.5379,
+      "step": 3871
+    },
+    {
+      "epoch": 0.6883555555555556,
+      "grad_norm": 0.35585365070286906,
+      "learning_rate": 4.676197198387913e-05,
+      "loss": 0.6212,
+      "step": 3872
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.37002129824452296,
+      "learning_rate": 4.671323860433222e-05,
+      "loss": 0.6297,
+      "step": 3873
+    },
+    {
+      "epoch": 0.6887111111111112,
+      "grad_norm": 0.3633301309645365,
+      "learning_rate": 4.666452289209152e-05,
+      "loss": 0.5547,
+      "step": 3874
+    },
+    {
+      "epoch": 0.6888888888888889,
+      "grad_norm": 0.3534253713893809,
+      "learning_rate": 4.661582486330879e-05,
+      "loss": 0.5618,
+      "step": 3875
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.3593716800161438,
+      "learning_rate": 4.656714453412993e-05,
+      "loss": 0.5702,
+      "step": 3876
+    },
+    {
+      "epoch": 0.6892444444444444,
+      "grad_norm": 0.356579718553429,
+      "learning_rate": 4.651848192069498e-05,
+      "loss": 0.5589,
+      "step": 3877
+    },
+    {
+      "epoch": 0.6894222222222223,
+      "grad_norm": 0.3491130956442169,
+      "learning_rate": 4.64698370391381e-05,
+      "loss": 0.6025,
+      "step": 3878
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.35685189397008116,
+      "learning_rate": 4.642120990558758e-05,
+      "loss": 0.5465,
+      "step": 3879
+    },
+    {
+      "epoch": 0.6897777777777778,
+      "grad_norm": 0.34426661670923675,
+      "learning_rate": 4.637260053616581e-05,
+      "loss": 0.5827,
+      "step": 3880
+    },
+    {
+      "epoch": 0.6899555555555555,
+      "grad_norm": 0.3515706163716907,
+      "learning_rate": 4.6324008946989314e-05,
+      "loss": 0.5887,
+      "step": 3881
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.35541335364086346,
+      "learning_rate": 4.62754351541687e-05,
+      "loss": 0.5735,
+      "step": 3882
+    },
+    {
+      "epoch": 0.6903111111111111,
+      "grad_norm": 0.37546258532550886,
+      "learning_rate": 4.622687917380868e-05,
+      "loss": 0.5746,
+      "step": 3883
+    },
+    {
+      "epoch": 0.6904888888888889,
+      "grad_norm": 0.34909350657030414,
+      "learning_rate": 4.6178341022008054e-05,
+      "loss": 0.5477,
+      "step": 3884
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.35697970341341345,
+      "learning_rate": 4.612982071485974e-05,
+      "loss": 0.5839,
+      "step": 3885
+    },
+    {
+      "epoch": 0.6908444444444445,
+      "grad_norm": 0.34245040410489874,
+      "learning_rate": 4.60813182684507e-05,
+      "loss": 0.576,
+      "step": 3886
+    },
+    {
+      "epoch": 0.6910222222222222,
+      "grad_norm": 0.35896187582512795,
+      "learning_rate": 4.6032833698862044e-05,
+      "loss": 0.5917,
+      "step": 3887
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3479823427169532,
+      "learning_rate": 4.5984367022168786e-05,
+      "loss": 0.6126,
+      "step": 3888
+    },
+    {
+      "epoch": 0.6913777777777778,
+      "grad_norm": 0.36654176836804997,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.6102,
+      "step": 3889
+    },
+    {
+      "epoch": 0.6915555555555556,
+      "grad_norm": 0.3496069876103817,
+      "learning_rate": 4.588748741173963e-05,
+      "loss": 0.5949,
+      "step": 3890
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.3504383001973791,
+      "learning_rate": 4.5839074510124314e-05,
+      "loss": 0.5858,
+      "step": 3891
+    },
+    {
+      "epoch": 0.6919111111111111,
+      "grad_norm": 0.33586675396245075,
+      "learning_rate": 4.5790679565645544e-05,
+      "loss": 0.5928,
+      "step": 3892
+    },
+    {
+      "epoch": 0.6920888888888889,
+      "grad_norm": 0.36542453642785044,
+      "learning_rate": 4.5742302594348894e-05,
+      "loss": 0.6056,
+      "step": 3893
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.36214877658285244,
+      "learning_rate": 4.569394361227367e-05,
+      "loss": 0.5848,
+      "step": 3894
+    },
+    {
+      "epoch": 0.6924444444444444,
+      "grad_norm": 0.36339216006050884,
+      "learning_rate": 4.564560263545351e-05,
+      "loss": 0.6122,
+      "step": 3895
+    },
+    {
+      "epoch": 0.6926222222222223,
+      "grad_norm": 0.3591363101456611,
+      "learning_rate": 4.559727967991584e-05,
+      "loss": 0.6134,
+      "step": 3896
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3425321461841997,
+      "learning_rate": 4.554897476168223e-05,
+      "loss": 0.6014,
+      "step": 3897
+    },
+    {
+      "epoch": 0.6929777777777778,
+      "grad_norm": 0.34664678541799615,
+      "learning_rate": 4.5500687896768256e-05,
+      "loss": 0.5988,
+      "step": 3898
+    },
+    {
+      "epoch": 0.6931555555555555,
+      "grad_norm": 0.34113760838225743,
+      "learning_rate": 4.54524191011835e-05,
+      "loss": 0.5651,
+      "step": 3899
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.35157119209602583,
+      "learning_rate": 4.540416839093156e-05,
+      "loss": 0.5755,
+      "step": 3900
+    },
+    {
+      "epoch": 0.6935111111111111,
+      "grad_norm": 0.44879573349370927,
+      "learning_rate": 4.5355935782010015e-05,
+      "loss": 0.5823,
+      "step": 3901
+    },
+    {
+      "epoch": 0.6936888888888889,
+      "grad_norm": 0.3615969911867251,
+      "learning_rate": 4.5307721290410475e-05,
+      "loss": 0.5743,
+      "step": 3902
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.3869661107969985,
+      "learning_rate": 4.5259524932118526e-05,
+      "loss": 0.579,
+      "step": 3903
+    },
+    {
+      "epoch": 0.6940444444444445,
+      "grad_norm": 0.3722069911520766,
+      "learning_rate": 4.521134672311373e-05,
+      "loss": 0.5587,
+      "step": 3904
+    },
+    {
+      "epoch": 0.6942222222222222,
+      "grad_norm": 0.34651584775074395,
+      "learning_rate": 4.516318667936967e-05,
+      "loss": 0.5768,
+      "step": 3905
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3624580281710354,
+      "learning_rate": 4.511504481685386e-05,
+      "loss": 0.5995,
+      "step": 3906
+    },
+    {
+      "epoch": 0.6945777777777777,
+      "grad_norm": 0.354355033707949,
+      "learning_rate": 4.5066921151527816e-05,
+      "loss": 0.588,
+      "step": 3907
+    },
+    {
+      "epoch": 0.6947555555555556,
+      "grad_norm": 0.3511116105009167,
+      "learning_rate": 4.5018815699347004e-05,
+      "loss": 0.577,
+      "step": 3908
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.47298146572786093,
+      "learning_rate": 4.497072847626087e-05,
+      "loss": 0.6052,
+      "step": 3909
+    },
+    {
+      "epoch": 0.6951111111111111,
+      "grad_norm": 0.35753521927971,
+      "learning_rate": 4.4922659498212796e-05,
+      "loss": 0.5675,
+      "step": 3910
+    },
+    {
+      "epoch": 0.6952888888888888,
+      "grad_norm": 0.36180816973757196,
+      "learning_rate": 4.487460878114017e-05,
+      "loss": 0.5889,
+      "step": 3911
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.36529679553091143,
+      "learning_rate": 4.482657634097416e-05,
+      "loss": 0.5563,
+      "step": 3912
+    },
+    {
+      "epoch": 0.6956444444444444,
+      "grad_norm": 0.3435383267586004,
+      "learning_rate": 4.477856219364015e-05,
+      "loss": 0.584,
+      "step": 3913
+    },
+    {
+      "epoch": 0.6958222222222222,
+      "grad_norm": 0.36156980011748857,
+      "learning_rate": 4.4730566355057145e-05,
+      "loss": 0.6063,
+      "step": 3914
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3649794175465803,
+      "learning_rate": 4.4682588841138396e-05,
+      "loss": 0.5851,
+      "step": 3915
+    },
+    {
+      "epoch": 0.6961777777777778,
+      "grad_norm": 0.3438709227117961,
+      "learning_rate": 4.4634629667790774e-05,
+      "loss": 0.5718,
+      "step": 3916
+    },
+    {
+      "epoch": 0.6963555555555555,
+      "grad_norm": 0.34980101432518595,
+      "learning_rate": 4.4586688850915345e-05,
+      "loss": 0.5796,
+      "step": 3917
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.42776970744186266,
+      "learning_rate": 4.453876640640684e-05,
+      "loss": 0.6401,
+      "step": 3918
+    },
+    {
+      "epoch": 0.6967111111111111,
+      "grad_norm": 0.3463354402996292,
+      "learning_rate": 4.449086235015414e-05,
+      "loss": 0.5768,
+      "step": 3919
+    },
+    {
+      "epoch": 0.6968888888888889,
+      "grad_norm": 0.34704208506945794,
+      "learning_rate": 4.444297669803981e-05,
+      "loss": 0.6196,
+      "step": 3920
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.3653213905469017,
+      "learning_rate": 4.43951094659404e-05,
+      "loss": 0.5632,
+      "step": 3921
+    },
+    {
+      "epoch": 0.6972444444444444,
+      "grad_norm": 0.33422460568468454,
+      "learning_rate": 4.434726066972649e-05,
+      "loss": 0.5462,
+      "step": 3922
+    },
+    {
+      "epoch": 0.6974222222222223,
+      "grad_norm": 0.37515661882135604,
+      "learning_rate": 4.429943032526225e-05,
+      "loss": 0.5945,
+      "step": 3923
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.33298265892550527,
+      "learning_rate": 4.4251618448406073e-05,
+      "loss": 0.5896,
+      "step": 3924
+    },
+    {
+      "epoch": 0.6977777777777778,
+      "grad_norm": 0.34416117702092097,
+      "learning_rate": 4.42038250550099e-05,
+      "loss": 0.5701,
+      "step": 3925
+    },
+    {
+      "epoch": 0.6979555555555556,
+      "grad_norm": 0.34864441108067806,
+      "learning_rate": 4.415605016091985e-05,
+      "loss": 0.5772,
+      "step": 3926
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.32104961212294114,
+      "learning_rate": 4.410829378197562e-05,
+      "loss": 0.5817,
+      "step": 3927
+    },
+    {
+      "epoch": 0.6983111111111111,
+      "grad_norm": 0.34796075997418663,
+      "learning_rate": 4.406055593401104e-05,
+      "loss": 0.5426,
+      "step": 3928
+    },
+    {
+      "epoch": 0.6984888888888889,
+      "grad_norm": 0.3360833218633838,
+      "learning_rate": 4.401283663285355e-05,
+      "loss": 0.5557,
+      "step": 3929
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.35748969097689026,
+      "learning_rate": 4.396513589432467e-05,
+      "loss": 0.583,
+      "step": 3930
+    },
+    {
+      "epoch": 0.6988444444444445,
+      "grad_norm": 0.38150841739853314,
+      "learning_rate": 4.3917453734239566e-05,
+      "loss": 0.5926,
+      "step": 3931
+    },
+    {
+      "epoch": 0.6990222222222222,
+      "grad_norm": 0.38293614438182194,
+      "learning_rate": 4.386979016840735e-05,
+      "loss": 0.5796,
+      "step": 3932
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.3560881302825979,
+      "learning_rate": 4.3822145212630964e-05,
+      "loss": 0.5847,
+      "step": 3933
+    },
+    {
+      "epoch": 0.6993777777777778,
+      "grad_norm": 0.34797452041665655,
+      "learning_rate": 4.377451888270715e-05,
+      "loss": 0.5607,
+      "step": 3934
+    },
+    {
+      "epoch": 0.6995555555555556,
+      "grad_norm": 0.37016955208761415,
+      "learning_rate": 4.37269111944265e-05,
+      "loss": 0.5989,
+      "step": 3935
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.35275258352121736,
+      "learning_rate": 4.367932216357342e-05,
+      "loss": 0.5698,
+      "step": 3936
+    },
+    {
+      "epoch": 0.6999111111111112,
+      "grad_norm": 0.39192577891409336,
+      "learning_rate": 4.363175180592611e-05,
+      "loss": 0.6282,
+      "step": 3937
+    },
+    {
+      "epoch": 0.7000888888888889,
+      "grad_norm": 0.3978897026500953,
+      "learning_rate": 4.35842001372566e-05,
+      "loss": 0.557,
+      "step": 3938
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.40034566690856965,
+      "learning_rate": 4.3536667173330726e-05,
+      "loss": 0.5879,
+      "step": 3939
+    },
+    {
+      "epoch": 0.7004444444444444,
+      "grad_norm": 0.3303189964398963,
+      "learning_rate": 4.348915292990809e-05,
+      "loss": 0.5353,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7006222222222223,
+      "grad_norm": 0.3399606245816993,
+      "learning_rate": 4.344165742274215e-05,
+      "loss": 0.561,
+      "step": 3941
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3678808617997972,
+      "learning_rate": 4.339418066758008e-05,
+      "loss": 0.6164,
+      "step": 3942
+    },
+    {
+      "epoch": 0.7009777777777778,
+      "grad_norm": 0.3553190925477395,
+      "learning_rate": 4.334672268016288e-05,
+      "loss": 0.5794,
+      "step": 3943
+    },
+    {
+      "epoch": 0.7011555555555555,
+      "grad_norm": 0.338682014565456,
+      "learning_rate": 4.3299283476225315e-05,
+      "loss": 0.5305,
+      "step": 3944
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.35077909335599466,
+      "learning_rate": 4.325186307149593e-05,
+      "loss": 0.6154,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7015111111111111,
+      "grad_norm": 0.341559459947685,
+      "learning_rate": 4.320446148169707e-05,
+      "loss": 0.5657,
+      "step": 3946
+    },
+    {
+      "epoch": 0.7016888888888889,
+      "grad_norm": 0.34594671246955,
+      "learning_rate": 4.3157078722544685e-05,
+      "loss": 0.5702,
+      "step": 3947
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.34414227911143863,
+      "learning_rate": 4.310971480974875e-05,
+      "loss": 0.5798,
+      "step": 3948
+    },
+    {
+      "epoch": 0.7020444444444445,
+      "grad_norm": 0.3796641230876599,
+      "learning_rate": 4.30623697590127e-05,
+      "loss": 0.5896,
+      "step": 3949
+    },
+    {
+      "epoch": 0.7022222222222222,
+      "grad_norm": 0.3622368752336497,
+      "learning_rate": 4.301504358603401e-05,
+      "loss": 0.573,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.337223851700592,
+      "learning_rate": 4.296773630650358e-05,
+      "loss": 0.5236,
+      "step": 3951
+    },
+    {
+      "epoch": 0.7025777777777777,
+      "grad_norm": 0.3458866641583537,
+      "learning_rate": 4.292044793610637e-05,
+      "loss": 0.5643,
+      "step": 3952
+    },
+    {
+      "epoch": 0.7027555555555556,
+      "grad_norm": 0.34992414132112304,
+      "learning_rate": 4.287317849052075e-05,
+      "loss": 0.576,
+      "step": 3953
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.36262374160369426,
+      "learning_rate": 4.2825927985419144e-05,
+      "loss": 0.5958,
+      "step": 3954
+    },
+    {
+      "epoch": 0.7031111111111111,
+      "grad_norm": 0.35643758035536016,
+      "learning_rate": 4.2778696436467404e-05,
+      "loss": 0.5541,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7032888888888889,
+      "grad_norm": 0.38528338069749735,
+      "learning_rate": 4.273148385932526e-05,
+      "loss": 0.5869,
+      "step": 3956
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.3627013274430368,
+      "learning_rate": 4.268429026964611e-05,
+      "loss": 0.6232,
+      "step": 3957
+    },
+    {
+      "epoch": 0.7036444444444444,
+      "grad_norm": 0.3564755402380381,
+      "learning_rate": 4.263711568307707e-05,
+      "loss": 0.595,
+      "step": 3958
+    },
+    {
+      "epoch": 0.7038222222222222,
+      "grad_norm": 0.414100561482083,
+      "learning_rate": 4.258996011525893e-05,
+      "loss": 0.5607,
+      "step": 3959
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.34566607212314415,
+      "learning_rate": 4.2542823581826205e-05,
+      "loss": 0.593,
+      "step": 3960
+    },
+    {
+      "epoch": 0.7041777777777778,
+      "grad_norm": 0.35451448279944625,
+      "learning_rate": 4.2495706098407085e-05,
+      "loss": 0.588,
+      "step": 3961
+    },
+    {
+      "epoch": 0.7043555555555555,
+      "grad_norm": 0.36541964699174234,
+      "learning_rate": 4.244860768062343e-05,
+      "loss": 0.6064,
+      "step": 3962
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.32826248336139785,
+      "learning_rate": 4.2401528344090804e-05,
+      "loss": 0.5537,
+      "step": 3963
+    },
+    {
+      "epoch": 0.7047111111111111,
+      "grad_norm": 0.3385579201298541,
+      "learning_rate": 4.235446810441841e-05,
+      "loss": 0.5438,
+      "step": 3964
+    },
+    {
+      "epoch": 0.7048888888888889,
+      "grad_norm": 0.3766815200629706,
+      "learning_rate": 4.2307426977209164e-05,
+      "loss": 0.5692,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.35430660682312387,
+      "learning_rate": 4.226040497805962e-05,
+      "loss": 0.5467,
+      "step": 3966
+    },
+    {
+      "epoch": 0.7052444444444445,
+      "grad_norm": 0.3854060841328241,
+      "learning_rate": 4.2213402122559986e-05,
+      "loss": 0.5578,
+      "step": 3967
+    },
+    {
+      "epoch": 0.7054222222222222,
+      "grad_norm": 0.36729374311813695,
+      "learning_rate": 4.216641842629413e-05,
+      "loss": 0.5559,
+      "step": 3968
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3453875029217099,
+      "learning_rate": 4.2119453904839565e-05,
+      "loss": 0.553,
+      "step": 3969
+    },
+    {
+      "epoch": 0.7057777777777777,
+      "grad_norm": 0.3455780184007163,
+      "learning_rate": 4.20725085737675e-05,
+      "loss": 0.5616,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7059555555555556,
+      "grad_norm": 0.40194484512667955,
+      "learning_rate": 4.202558244864261e-05,
+      "loss": 0.565,
+      "step": 3971
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.38172898189925675,
+      "learning_rate": 4.197867554502347e-05,
+      "loss": 0.5999,
+      "step": 3972
+    },
+    {
+      "epoch": 0.7063111111111111,
+      "grad_norm": 0.4468107895110227,
+      "learning_rate": 4.193178787846198e-05,
+      "loss": 0.6152,
+      "step": 3973
+    },
+    {
+      "epoch": 0.7064888888888889,
+      "grad_norm": 0.3499827521892334,
+      "learning_rate": 4.188491946450398e-05,
+      "loss": 0.5905,
+      "step": 3974
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.349635214945576,
+      "learning_rate": 4.1838070318688604e-05,
+      "loss": 0.5308,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7068444444444445,
+      "grad_norm": 0.38503473700201507,
+      "learning_rate": 4.1791240456548905e-05,
+      "loss": 0.5635,
+      "step": 3976
+    },
+    {
+      "epoch": 0.7070222222222222,
+      "grad_norm": 0.33552138655392005,
+      "learning_rate": 4.174442989361126e-05,
+      "loss": 0.5559,
+      "step": 3977
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.36321069238589176,
+      "learning_rate": 4.169763864539591e-05,
+      "loss": 0.6015,
+      "step": 3978
+    },
+    {
+      "epoch": 0.7073777777777778,
+      "grad_norm": 0.36501739700508484,
+      "learning_rate": 4.165086672741647e-05,
+      "loss": 0.5872,
+      "step": 3979
+    },
+    {
+      "epoch": 0.7075555555555556,
+      "grad_norm": 0.35122587976977737,
+      "learning_rate": 4.160411415518026e-05,
+      "loss": 0.5476,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.35213675073648626,
+      "learning_rate": 4.1557380944188184e-05,
+      "loss": 0.549,
+      "step": 3981
+    },
+    {
+      "epoch": 0.7079111111111112,
+      "grad_norm": 0.3504046998050794,
+      "learning_rate": 4.15106671099347e-05,
+      "loss": 0.5597,
+      "step": 3982
+    },
+    {
+      "epoch": 0.7080888888888889,
+      "grad_norm": 0.3663624151216087,
+      "learning_rate": 4.1463972667907845e-05,
+      "loss": 0.5719,
+      "step": 3983
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.35311120741861096,
+      "learning_rate": 4.141729763358925e-05,
+      "loss": 0.5466,
+      "step": 3984
+    },
+    {
+      "epoch": 0.7084444444444444,
+      "grad_norm": 0.335107702821768,
+      "learning_rate": 4.137064202245407e-05,
+      "loss": 0.5842,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7086222222222223,
+      "grad_norm": 0.3604053414337539,
+      "learning_rate": 4.132400584997106e-05,
+      "loss": 0.5994,
+      "step": 3986
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3720568033523875,
+      "learning_rate": 4.1277389131602495e-05,
+      "loss": 0.5714,
+      "step": 3987
+    },
+    {
+      "epoch": 0.7089777777777778,
+      "grad_norm": 0.3900155621358108,
+      "learning_rate": 4.123079188280424e-05,
+      "loss": 0.5968,
+      "step": 3988
+    },
+    {
+      "epoch": 0.7091555555555555,
+      "grad_norm": 0.34689047078560853,
+      "learning_rate": 4.1184214119025676e-05,
+      "loss": 0.5679,
+      "step": 3989
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.3683538834004813,
+      "learning_rate": 4.1137655855709723e-05,
+      "loss": 0.5762,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7095111111111111,
+      "grad_norm": 0.35717293467111283,
+      "learning_rate": 4.1091117108292854e-05,
+      "loss": 0.5982,
+      "step": 3991
+    },
+    {
+      "epoch": 0.7096888888888889,
+      "grad_norm": 0.3564725434720055,
+      "learning_rate": 4.104459789220506e-05,
+      "loss": 0.5981,
+      "step": 3992
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.35998229520275443,
+      "learning_rate": 4.099809822286984e-05,
+      "loss": 0.596,
+      "step": 3993
+    },
+    {
+      "epoch": 0.7100444444444445,
+      "grad_norm": 0.35842618817594113,
+      "learning_rate": 4.095161811570429e-05,
+      "loss": 0.618,
+      "step": 3994
+    },
+    {
+      "epoch": 0.7102222222222222,
+      "grad_norm": 0.36539750101547136,
+      "learning_rate": 4.090515758611884e-05,
+      "loss": 0.5847,
+      "step": 3995
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.37946245521047917,
+      "learning_rate": 4.085871664951769e-05,
+      "loss": 0.5677,
+      "step": 3996
+    },
+    {
+      "epoch": 0.7105777777777778,
+      "grad_norm": 0.35822125396244453,
+      "learning_rate": 4.081229532129827e-05,
+      "loss": 0.5778,
+      "step": 3997
+    },
+    {
+      "epoch": 0.7107555555555556,
+      "grad_norm": 0.3328832917499689,
+      "learning_rate": 4.076589361685177e-05,
+      "loss": 0.539,
+      "step": 3998
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.39820749728525684,
+      "learning_rate": 4.0719511551562606e-05,
+      "loss": 0.5774,
+      "step": 3999
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.3422717473390323,
+      "learning_rate": 4.067314914080898e-05,
+      "loss": 0.5835,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7112888888888889,
+      "grad_norm": 0.352775396067291,
+      "learning_rate": 4.062680639996225e-05,
+      "loss": 0.5528,
+      "step": 4001
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.3411915180281605,
+      "learning_rate": 4.0580483344387586e-05,
+      "loss": 0.5491,
+      "step": 4002
+    },
+    {
+      "epoch": 0.7116444444444444,
+      "grad_norm": 0.37745777867250085,
+      "learning_rate": 4.053417998944331e-05,
+      "loss": 0.5865,
+      "step": 4003
+    },
+    {
+      "epoch": 0.7118222222222222,
+      "grad_norm": 0.3457036361204248,
+      "learning_rate": 4.048789635048154e-05,
+      "loss": 0.6043,
+      "step": 4004
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.34575881214027726,
+      "learning_rate": 4.044163244284753e-05,
+      "loss": 0.5889,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7121777777777778,
+      "grad_norm": 0.3816693732527338,
+      "learning_rate": 4.039538828188023e-05,
+      "loss": 0.5362,
+      "step": 4006
+    },
+    {
+      "epoch": 0.7123555555555555,
+      "grad_norm": 0.3830993084749753,
+      "learning_rate": 4.0349163882911944e-05,
+      "loss": 0.6239,
+      "step": 4007
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.3838765996228099,
+      "learning_rate": 4.030295926126845e-05,
+      "loss": 0.5719,
+      "step": 4008
+    },
+    {
+      "epoch": 0.7127111111111111,
+      "grad_norm": 0.3442245500074592,
+      "learning_rate": 4.025677443226894e-05,
+      "loss": 0.5474,
+      "step": 4009
+    },
+    {
+      "epoch": 0.7128888888888889,
+      "grad_norm": 0.36384519083507694,
+      "learning_rate": 4.0210609411226075e-05,
+      "loss": 0.5823,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.3468376114904197,
+      "learning_rate": 4.016446421344594e-05,
+      "loss": 0.5713,
+      "step": 4011
+    },
+    {
+      "epoch": 0.7132444444444445,
+      "grad_norm": 0.35870762072720874,
+      "learning_rate": 4.0118338854228034e-05,
+      "loss": 0.5849,
+      "step": 4012
+    },
+    {
+      "epoch": 0.7134222222222222,
+      "grad_norm": 0.3540438295911249,
+      "learning_rate": 4.007223334886531e-05,
+      "loss": 0.6114,
+      "step": 4013
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3497239823866297,
+      "learning_rate": 4.0026147712644104e-05,
+      "loss": 0.5804,
+      "step": 4014
+    },
+    {
+      "epoch": 0.7137777777777777,
+      "grad_norm": 0.3378418022451801,
+      "learning_rate": 3.998008196084417e-05,
+      "loss": 0.5747,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7139555555555556,
+      "grad_norm": 0.3488442436582133,
+      "learning_rate": 3.99340361087387e-05,
+      "loss": 0.5768,
+      "step": 4016
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.3555741020905532,
+      "learning_rate": 3.988801017159425e-05,
+      "loss": 0.5682,
+      "step": 4017
+    },
+    {
+      "epoch": 0.7143111111111111,
+      "grad_norm": 0.36376143575835795,
+      "learning_rate": 3.98420041646708e-05,
+      "loss": 0.6142,
+      "step": 4018
+    },
+    {
+      "epoch": 0.7144888888888888,
+      "grad_norm": 0.34565095752321096,
+      "learning_rate": 3.979601810322169e-05,
+      "loss": 0.5899,
+      "step": 4019
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.3515841199041589,
+      "learning_rate": 3.975005200249372e-05,
+      "loss": 0.5976,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7148444444444444,
+      "grad_norm": 0.35633212917359547,
+      "learning_rate": 3.970410587772691e-05,
+      "loss": 0.5795,
+      "step": 4021
+    },
+    {
+      "epoch": 0.7150222222222222,
+      "grad_norm": 0.35423930445191737,
+      "learning_rate": 3.965817974415492e-05,
+      "loss": 0.5708,
+      "step": 4022
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.34987913178714297,
+      "learning_rate": 3.961227361700448e-05,
+      "loss": 0.6183,
+      "step": 4023
+    },
+    {
+      "epoch": 0.7153777777777778,
+      "grad_norm": 0.35004593012021906,
+      "learning_rate": 3.956638751149596e-05,
+      "loss": 0.6109,
+      "step": 4024
+    },
+    {
+      "epoch": 0.7155555555555555,
+      "grad_norm": 0.36854247956585573,
+      "learning_rate": 3.952052144284285e-05,
+      "loss": 0.551,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.35958798106968465,
+      "learning_rate": 3.947467542625225e-05,
+      "loss": 0.6042,
+      "step": 4026
+    },
+    {
+      "epoch": 0.7159111111111112,
+      "grad_norm": 0.34431588711996347,
+      "learning_rate": 3.9428849476924325e-05,
+      "loss": 0.5873,
+      "step": 4027
+    },
+    {
+      "epoch": 0.7160888888888889,
+      "grad_norm": 0.33196810414443173,
+      "learning_rate": 3.9383043610052885e-05,
+      "loss": 0.4939,
+      "step": 4028
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.3430989019721864,
+      "learning_rate": 3.933725784082483e-05,
+      "loss": 0.5867,
+      "step": 4029
+    },
+    {
+      "epoch": 0.7164444444444444,
+      "grad_norm": 0.3524933130613355,
+      "learning_rate": 3.929149218442052e-05,
+      "loss": 0.5634,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7166222222222223,
+      "grad_norm": 0.37469501040717856,
+      "learning_rate": 3.924574665601366e-05,
+      "loss": 0.5826,
+      "step": 4031
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.35083104199648535,
+      "learning_rate": 3.920002127077123e-05,
+      "loss": 0.5904,
+      "step": 4032
+    },
+    {
+      "epoch": 0.7169777777777778,
+      "grad_norm": 0.3724475501147572,
+      "learning_rate": 3.915431604385355e-05,
+      "loss": 0.5904,
+      "step": 4033
+    },
+    {
+      "epoch": 0.7171555555555555,
+      "grad_norm": 0.34618822255733345,
+      "learning_rate": 3.910863099041424e-05,
+      "loss": 0.5483,
+      "step": 4034
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.3601642914062829,
+      "learning_rate": 3.9062966125600284e-05,
+      "loss": 0.5803,
+      "step": 4035
+    },
+    {
+      "epoch": 0.7175111111111111,
+      "grad_norm": 0.36856301709854145,
+      "learning_rate": 3.901732146455193e-05,
+      "loss": 0.5816,
+      "step": 4036
+    },
+    {
+      "epoch": 0.7176888888888889,
+      "grad_norm": 0.36459665313297895,
+      "learning_rate": 3.897169702240271e-05,
+      "loss": 0.5928,
+      "step": 4037
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.35997596044762503,
+      "learning_rate": 3.892609281427949e-05,
+      "loss": 0.5516,
+      "step": 4038
+    },
+    {
+      "epoch": 0.7180444444444445,
+      "grad_norm": 0.33535915386543563,
+      "learning_rate": 3.8880508855302425e-05,
+      "loss": 0.5719,
+      "step": 4039
+    },
+    {
+      "epoch": 0.7182222222222222,
+      "grad_norm": 0.35264025990849457,
+      "learning_rate": 3.8834945160584924e-05,
+      "loss": 0.5647,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.35588030402193865,
+      "learning_rate": 3.878940174523371e-05,
+      "loss": 0.5785,
+      "step": 4041
+    },
+    {
+      "epoch": 0.7185777777777778,
+      "grad_norm": 0.36948990063759324,
+      "learning_rate": 3.8743878624348785e-05,
+      "loss": 0.5777,
+      "step": 4042
+    },
+    {
+      "epoch": 0.7187555555555556,
+      "grad_norm": 0.36018982939033206,
+      "learning_rate": 3.869837581302338e-05,
+      "loss": 0.5814,
+      "step": 4043
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.3551419485843042,
+      "learning_rate": 3.865289332634407e-05,
+      "loss": 0.527,
+      "step": 4044
+    },
+    {
+      "epoch": 0.7191111111111111,
+      "grad_norm": 0.3348241237514712,
+      "learning_rate": 3.860743117939055e-05,
+      "loss": 0.5448,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7192888888888889,
+      "grad_norm": 0.3690214567390772,
+      "learning_rate": 3.856198938723598e-05,
+      "loss": 0.5956,
+      "step": 4046
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.363777524524831,
+      "learning_rate": 3.851656796494654e-05,
+      "loss": 0.5784,
+      "step": 4047
+    },
+    {
+      "epoch": 0.7196444444444444,
+      "grad_norm": 0.3642390604505968,
+      "learning_rate": 3.847116692758189e-05,
+      "loss": 0.5884,
+      "step": 4048
+    },
+    {
+      "epoch": 0.7198222222222223,
+      "grad_norm": 0.34675180166115704,
+      "learning_rate": 3.8425786290194676e-05,
+      "loss": 0.5721,
+      "step": 4049
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.33592207755262066,
+      "learning_rate": 3.838042606783106e-05,
+      "loss": 0.539,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7201777777777778,
+      "grad_norm": 0.34386299621856947,
+      "learning_rate": 3.833508627553016e-05,
+      "loss": 0.5481,
+      "step": 4051
+    },
+    {
+      "epoch": 0.7203555555555555,
+      "grad_norm": 0.38573054045041,
+      "learning_rate": 3.828976692832458e-05,
+      "loss": 0.532,
+      "step": 4052
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.41869844148188573,
+      "learning_rate": 3.824446804123992e-05,
+      "loss": 0.5444,
+      "step": 4053
+    },
+    {
+      "epoch": 0.7207111111111111,
+      "grad_norm": 0.34660825949047447,
+      "learning_rate": 3.819918962929513e-05,
+      "loss": 0.5816,
+      "step": 4054
+    },
+    {
+      "epoch": 0.7208888888888889,
+      "grad_norm": 0.32896659407585827,
+      "learning_rate": 3.815393170750232e-05,
+      "loss": 0.5408,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.37168262839612937,
+      "learning_rate": 3.810869429086685e-05,
+      "loss": 0.6041,
+      "step": 4056
+    },
+    {
+      "epoch": 0.7212444444444445,
+      "grad_norm": 0.3892816750713322,
+      "learning_rate": 3.806347739438724e-05,
+      "loss": 0.6131,
+      "step": 4057
+    },
+    {
+      "epoch": 0.7214222222222222,
+      "grad_norm": 0.37509559352770205,
+      "learning_rate": 3.801828103305521e-05,
+      "loss": 0.5295,
+      "step": 4058
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.34406718198370034,
+      "learning_rate": 3.79731052218557e-05,
+      "loss": 0.5656,
+      "step": 4059
+    },
+    {
+      "epoch": 0.7217777777777777,
+      "grad_norm": 0.3633341354228841,
+      "learning_rate": 3.792794997576681e-05,
+      "loss": 0.6319,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7219555555555556,
+      "grad_norm": 0.3558423098248106,
+      "learning_rate": 3.7882815309759824e-05,
+      "loss": 0.5751,
+      "step": 4061
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.38394086631127355,
+      "learning_rate": 3.7837701238799216e-05,
+      "loss": 0.5474,
+      "step": 4062
+    },
+    {
+      "epoch": 0.7223111111111111,
+      "grad_norm": 0.34309866648557463,
+      "learning_rate": 3.779260777784263e-05,
+      "loss": 0.5832,
+      "step": 4063
+    },
+    {
+      "epoch": 0.7224888888888888,
+      "grad_norm": 0.34719677290897155,
+      "learning_rate": 3.7747534941840854e-05,
+      "loss": 0.5959,
+      "step": 4064
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.3712393325948241,
+      "learning_rate": 3.7702482745737874e-05,
+      "loss": 0.6139,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7228444444444444,
+      "grad_norm": 0.3443376122367561,
+      "learning_rate": 3.765745120447081e-05,
+      "loss": 0.5326,
+      "step": 4066
+    },
+    {
+      "epoch": 0.7230222222222222,
+      "grad_norm": 0.36927227522192413,
+      "learning_rate": 3.761244033296992e-05,
+      "loss": 0.5395,
+      "step": 4067
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3518835753039752,
+      "learning_rate": 3.756745014615868e-05,
+      "loss": 0.588,
+      "step": 4068
+    },
+    {
+      "epoch": 0.7233777777777778,
+      "grad_norm": 0.3534793121640155,
+      "learning_rate": 3.752248065895354e-05,
+      "loss": 0.635,
+      "step": 4069
+    },
+    {
+      "epoch": 0.7235555555555555,
+      "grad_norm": 0.39624142216125907,
+      "learning_rate": 3.747753188626434e-05,
+      "loss": 0.5726,
+      "step": 4070
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.3855795554773093,
+      "learning_rate": 3.7432603842993786e-05,
+      "loss": 0.5975,
+      "step": 4071
+    },
+    {
+      "epoch": 0.7239111111111111,
+      "grad_norm": 0.3499491109496454,
+      "learning_rate": 3.738769654403796e-05,
+      "loss": 0.5854,
+      "step": 4072
+    },
+    {
+      "epoch": 0.7240888888888889,
+      "grad_norm": 0.350011506488243,
+      "learning_rate": 3.7342810004285836e-05,
+      "loss": 0.5427,
+      "step": 4073
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.33381178021937996,
+      "learning_rate": 3.7297944238619706e-05,
+      "loss": 0.5577,
+      "step": 4074
+    },
+    {
+      "epoch": 0.7244444444444444,
+      "grad_norm": 0.3571728387181711,
+      "learning_rate": 3.725309926191479e-05,
+      "loss": 0.5613,
+      "step": 4075
+    },
+    {
+      "epoch": 0.7246222222222222,
+      "grad_norm": 0.35226413538193374,
+      "learning_rate": 3.720827508903962e-05,
+      "loss": 0.5745,
+      "step": 4076
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.34273982433556716,
+      "learning_rate": 3.716347173485563e-05,
+      "loss": 0.551,
+      "step": 4077
+    },
+    {
+      "epoch": 0.7249777777777778,
+      "grad_norm": 0.3508742438452673,
+      "learning_rate": 3.711868921421745e-05,
+      "loss": 0.5621,
+      "step": 4078
+    },
+    {
+      "epoch": 0.7251555555555556,
+      "grad_norm": 0.35235341107733753,
+      "learning_rate": 3.707392754197281e-05,
+      "loss": 0.5797,
+      "step": 4079
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.36041693800123004,
+      "learning_rate": 3.7029186732962515e-05,
+      "loss": 0.571,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7255111111111111,
+      "grad_norm": 0.3341291652682757,
+      "learning_rate": 3.6984466802020436e-05,
+      "loss": 0.5835,
+      "step": 4081
+    },
+    {
+      "epoch": 0.7256888888888889,
+      "grad_norm": 0.3484984545996794,
+      "learning_rate": 3.6939767763973545e-05,
+      "loss": 0.5918,
+      "step": 4082
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.33309187853905786,
+      "learning_rate": 3.6895089633641856e-05,
+      "loss": 0.5734,
+      "step": 4083
+    },
+    {
+      "epoch": 0.7260444444444445,
+      "grad_norm": 0.35480576818029674,
+      "learning_rate": 3.6850432425838485e-05,
+      "loss": 0.5579,
+      "step": 4084
+    },
+    {
+      "epoch": 0.7262222222222222,
+      "grad_norm": 0.34738002862697276,
+      "learning_rate": 3.680579615536961e-05,
+      "loss": 0.5504,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3443099710650306,
+      "learning_rate": 3.676118083703442e-05,
+      "loss": 0.593,
+      "step": 4086
+    },
+    {
+      "epoch": 0.7265777777777778,
+      "grad_norm": 0.3610581388304893,
+      "learning_rate": 3.671658648562523e-05,
+      "loss": 0.5769,
+      "step": 4087
+    },
+    {
+      "epoch": 0.7267555555555556,
+      "grad_norm": 0.34144703749747113,
+      "learning_rate": 3.667201311592733e-05,
+      "loss": 0.5748,
+      "step": 4088
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.35182172456475097,
+      "learning_rate": 3.66274607427191e-05,
+      "loss": 0.5783,
+      "step": 4089
+    },
+    {
+      "epoch": 0.7271111111111112,
+      "grad_norm": 0.44634107103541587,
+      "learning_rate": 3.6582929380771956e-05,
+      "loss": 0.5867,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7272888888888889,
+      "grad_norm": 0.3976661646796536,
+      "learning_rate": 3.6538419044850335e-05,
+      "loss": 0.6094,
+      "step": 4091
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.5028916670005593,
+      "learning_rate": 3.6493929749711734e-05,
+      "loss": 0.5832,
+      "step": 4092
+    },
+    {
+      "epoch": 0.7276444444444444,
+      "grad_norm": 0.35196196607357777,
+      "learning_rate": 3.644946151010654e-05,
+      "loss": 0.5778,
+      "step": 4093
+    },
+    {
+      "epoch": 0.7278222222222223,
+      "grad_norm": 0.35633831307443015,
+      "learning_rate": 3.640501434077841e-05,
+      "loss": 0.5873,
+      "step": 4094
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3451010902823216,
+      "learning_rate": 3.6360588256463734e-05,
+      "loss": 0.54,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7281777777777778,
+      "grad_norm": 0.3513175148107401,
+      "learning_rate": 3.631618327189218e-05,
+      "loss": 0.5679,
+      "step": 4096
+    },
+    {
+      "epoch": 0.7283555555555555,
+      "grad_norm": 0.3798334042299023,
+      "learning_rate": 3.627179940178615e-05,
+      "loss": 0.6058,
+      "step": 4097
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.42111831238176,
+      "learning_rate": 3.622743666086132e-05,
+      "loss": 0.5997,
+      "step": 4098
+    },
+    {
+      "epoch": 0.7287111111111111,
+      "grad_norm": 0.4005910154856992,
+      "learning_rate": 3.61830950638261e-05,
+      "loss": 0.536,
+      "step": 4099
+    },
+    {
+      "epoch": 0.7288888888888889,
+      "grad_norm": 0.3601110817172015,
+      "learning_rate": 3.6138774625382134e-05,
+      "loss": 0.5917,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.3582316720363536,
+      "learning_rate": 3.609447536022379e-05,
+      "loss": 0.5859,
+      "step": 4101
+    },
+    {
+      "epoch": 0.7292444444444445,
+      "grad_norm": 0.3600353042347371,
+      "learning_rate": 3.605019728303871e-05,
+      "loss": 0.5647,
+      "step": 4102
+    },
+    {
+      "epoch": 0.7294222222222222,
+      "grad_norm": 0.33023332277899525,
+      "learning_rate": 3.600594040850724e-05,
+      "loss": 0.558,
+      "step": 4103
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3414980810119145,
+      "learning_rate": 3.596170475130287e-05,
+      "loss": 0.5708,
+      "step": 4104
+    },
+    {
+      "epoch": 0.7297777777777777,
+      "grad_norm": 0.36301467661581527,
+      "learning_rate": 3.591749032609197e-05,
+      "loss": 0.5392,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7299555555555556,
+      "grad_norm": 0.3489219650050646,
+      "learning_rate": 3.5873297147533915e-05,
+      "loss": 0.5762,
+      "step": 4106
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.33037371048255665,
+      "learning_rate": 3.582912523028101e-05,
+      "loss": 0.5223,
+      "step": 4107
+    },
+    {
+      "epoch": 0.7303111111111111,
+      "grad_norm": 0.39428863615782694,
+      "learning_rate": 3.5784974588978545e-05,
+      "loss": 0.5553,
+      "step": 4108
+    },
+    {
+      "epoch": 0.7304888888888889,
+      "grad_norm": 0.35566476944171455,
+      "learning_rate": 3.574084523826471e-05,
+      "loss": 0.5544,
+      "step": 4109
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.37357581249243205,
+      "learning_rate": 3.569673719277066e-05,
+      "loss": 0.5891,
+      "step": 4110
+    },
+    {
+      "epoch": 0.7308444444444444,
+      "grad_norm": 0.42399272556018314,
+      "learning_rate": 3.5652650467120485e-05,
+      "loss": 0.5836,
+      "step": 4111
+    },
+    {
+      "epoch": 0.7310222222222222,
+      "grad_norm": 0.3583603897360114,
+      "learning_rate": 3.5608585075931214e-05,
+      "loss": 0.5504,
+      "step": 4112
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.34530773141229776,
+      "learning_rate": 3.556454103381278e-05,
+      "loss": 0.5627,
+      "step": 4113
+    },
+    {
+      "epoch": 0.7313777777777778,
+      "grad_norm": 0.42322166626875707,
+      "learning_rate": 3.552051835536807e-05,
+      "loss": 0.6473,
+      "step": 4114
+    },
+    {
+      "epoch": 0.7315555555555555,
+      "grad_norm": 0.35419830900647725,
+      "learning_rate": 3.547651705519285e-05,
+      "loss": 0.564,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.35548346764348576,
+      "learning_rate": 3.543253714787583e-05,
+      "loss": 0.5713,
+      "step": 4116
+    },
+    {
+      "epoch": 0.7319111111111111,
+      "grad_norm": 0.3353066853989554,
+      "learning_rate": 3.538857864799862e-05,
+      "loss": 0.564,
+      "step": 4117
+    },
+    {
+      "epoch": 0.7320888888888889,
+      "grad_norm": 0.33495408069979127,
+      "learning_rate": 3.534464157013574e-05,
+      "loss": 0.5566,
+      "step": 4118
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.4003992692488059,
+      "learning_rate": 3.530072592885451e-05,
+      "loss": 0.6307,
+      "step": 4119
+    },
+    {
+      "epoch": 0.7324444444444445,
+      "grad_norm": 0.4072528101181034,
+      "learning_rate": 3.5256831738715345e-05,
+      "loss": 0.5914,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7326222222222222,
+      "grad_norm": 0.3290746062081954,
+      "learning_rate": 3.521295901427132e-05,
+      "loss": 0.5625,
+      "step": 4121
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.36993259684977065,
+      "learning_rate": 3.516910777006862e-05,
+      "loss": 0.6043,
+      "step": 4122
+    },
+    {
+      "epoch": 0.7329777777777777,
+      "grad_norm": 0.3654148060441524,
+      "learning_rate": 3.512527802064607e-05,
+      "loss": 0.6229,
+      "step": 4123
+    },
+    {
+      "epoch": 0.7331555555555556,
+      "grad_norm": 0.36813705642004046,
+      "learning_rate": 3.508146978053562e-05,
+      "loss": 0.5839,
+      "step": 4124
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.3625315228535952,
+      "learning_rate": 3.5037683064261806e-05,
+      "loss": 0.5463,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7335111111111111,
+      "grad_norm": 0.34439952872635465,
+      "learning_rate": 3.4993917886342334e-05,
+      "loss": 0.5528,
+      "step": 4126
+    },
+    {
+      "epoch": 0.7336888888888888,
+      "grad_norm": 0.3377119460572574,
+      "learning_rate": 3.4950174261287504e-05,
+      "loss": 0.5434,
+      "step": 4127
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.3660037048768086,
+      "learning_rate": 3.4906452203600616e-05,
+      "loss": 0.5889,
+      "step": 4128
+    },
+    {
+      "epoch": 0.7340444444444445,
+      "grad_norm": 0.4348114832931297,
+      "learning_rate": 3.4862751727777797e-05,
+      "loss": 0.5815,
+      "step": 4129
+    },
+    {
+      "epoch": 0.7342222222222222,
+      "grad_norm": 0.36818651603667996,
+      "learning_rate": 3.4819072848307986e-05,
+      "loss": 0.5486,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.35238484136797127,
+      "learning_rate": 3.4775415579673e-05,
+      "loss": 0.5792,
+      "step": 4131
+    },
+    {
+      "epoch": 0.7345777777777778,
+      "grad_norm": 0.35053655269699074,
+      "learning_rate": 3.473177993634745e-05,
+      "loss": 0.5106,
+      "step": 4132
+    },
+    {
+      "epoch": 0.7347555555555556,
+      "grad_norm": 0.33947758882610224,
+      "learning_rate": 3.468816593279883e-05,
+      "loss": 0.5689,
+      "step": 4133
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.3472516389991502,
+      "learning_rate": 3.4644573583487404e-05,
+      "loss": 0.5507,
+      "step": 4134
+    },
+    {
+      "epoch": 0.7351111111111112,
+      "grad_norm": 0.3383343297937684,
+      "learning_rate": 3.4601002902866284e-05,
+      "loss": 0.5468,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7352888888888889,
+      "grad_norm": 0.3486170421568541,
+      "learning_rate": 3.455745390538141e-05,
+      "loss": 0.5787,
+      "step": 4136
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.3539950870947957,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.5639,
+      "step": 4137
+    },
+    {
+      "epoch": 0.7356444444444444,
+      "grad_norm": 0.3531963654280004,
+      "learning_rate": 3.44704210175681e-05,
+      "loss": 0.594,
+      "step": 4138
+    },
+    {
+      "epoch": 0.7358222222222223,
+      "grad_norm": 0.3345096244368699,
+      "learning_rate": 3.4426937156095563e-05,
+      "loss": 0.557,
+      "step": 4139
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3463203137827246,
+      "learning_rate": 3.438347503547102e-05,
+      "loss": 0.5437,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7361777777777778,
+      "grad_norm": 0.35629817370214173,
+      "learning_rate": 3.434003467010441e-05,
+      "loss": 0.5686,
+      "step": 4141
+    },
+    {
+      "epoch": 0.7363555555555555,
+      "grad_norm": 0.3622511641958309,
+      "learning_rate": 3.429661607439848e-05,
+      "loss": 0.5954,
+      "step": 4142
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.3409325476288989,
+      "learning_rate": 3.425321926274863e-05,
+      "loss": 0.5354,
+      "step": 4143
+    },
+    {
+      "epoch": 0.7367111111111111,
+      "grad_norm": 0.33337697753647727,
+      "learning_rate": 3.420984424954328e-05,
+      "loss": 0.5295,
+      "step": 4144
+    },
+    {
+      "epoch": 0.7368888888888889,
+      "grad_norm": 0.3623020617043478,
+      "learning_rate": 3.416649104916333e-05,
+      "loss": 0.5942,
+      "step": 4145
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.36944463521944043,
+      "learning_rate": 3.412315967598274e-05,
+      "loss": 0.5389,
+      "step": 4146
+    },
+    {
+      "epoch": 0.7372444444444445,
+      "grad_norm": 0.34130609525144595,
+      "learning_rate": 3.407985014436797e-05,
+      "loss": 0.5818,
+      "step": 4147
+    },
+    {
+      "epoch": 0.7374222222222222,
+      "grad_norm": 0.35987167085537464,
+      "learning_rate": 3.403656246867849e-05,
+      "loss": 0.5959,
+      "step": 4148
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.34565129078536555,
+      "learning_rate": 3.399329666326625e-05,
+      "loss": 0.6006,
+      "step": 4149
+    },
+    {
+      "epoch": 0.7377777777777778,
+      "grad_norm": 0.36100202026702954,
+      "learning_rate": 3.3950052742476245e-05,
+      "loss": 0.5615,
+      "step": 4150
+    },
+    {
+      "epoch": 0.7379555555555556,
+      "grad_norm": 0.3300455130205511,
+      "learning_rate": 3.390683072064594e-05,
+      "loss": 0.5434,
+      "step": 4151
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.3676732014123496,
+      "learning_rate": 3.386363061210571e-05,
+      "loss": 0.6191,
+      "step": 4152
+    },
+    {
+      "epoch": 0.7383111111111111,
+      "grad_norm": 0.35786896139702584,
+      "learning_rate": 3.3820452431178606e-05,
+      "loss": 0.5757,
+      "step": 4153
+    },
+    {
+      "epoch": 0.7384888888888889,
+      "grad_norm": 0.3650214589300631,
+      "learning_rate": 3.377729619218043e-05,
+      "loss": 0.5624,
+      "step": 4154
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.3465952080971314,
+      "learning_rate": 3.3734161909419695e-05,
+      "loss": 0.5983,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7388444444444444,
+      "grad_norm": 0.36796677731123845,
+      "learning_rate": 3.369104959719763e-05,
+      "loss": 0.578,
+      "step": 4156
+    },
+    {
+      "epoch": 0.7390222222222222,
+      "grad_norm": 0.39879091582494125,
+      "learning_rate": 3.3647959269808205e-05,
+      "loss": 0.5241,
+      "step": 4157
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3436222373377845,
+      "learning_rate": 3.360489094153806e-05,
+      "loss": 0.5967,
+      "step": 4158
+    },
+    {
+      "epoch": 0.7393777777777778,
+      "grad_norm": 0.34481521565108186,
+      "learning_rate": 3.356184462666658e-05,
+      "loss": 0.6108,
+      "step": 4159
+    },
+    {
+      "epoch": 0.7395555555555555,
+      "grad_norm": 0.38876036512875123,
+      "learning_rate": 3.351882033946583e-05,
+      "loss": 0.6192,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.4868480567014935,
+      "learning_rate": 3.3475818094200585e-05,
+      "loss": 0.6012,
+      "step": 4161
+    },
+    {
+      "epoch": 0.7399111111111111,
+      "grad_norm": 0.3418390358719068,
+      "learning_rate": 3.343283790512829e-05,
+      "loss": 0.5624,
+      "step": 4162
+    },
+    {
+      "epoch": 0.7400888888888889,
+      "grad_norm": 0.36078330953090015,
+      "learning_rate": 3.33898797864991e-05,
+      "loss": 0.5807,
+      "step": 4163
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.3564857528496916,
+      "learning_rate": 3.334694375255585e-05,
+      "loss": 0.5422,
+      "step": 4164
+    },
+    {
+      "epoch": 0.7404444444444445,
+      "grad_norm": 0.3683482387828699,
+      "learning_rate": 3.330402981753403e-05,
+      "loss": 0.5646,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7406222222222222,
+      "grad_norm": 0.3449637883678767,
+      "learning_rate": 3.326113799566187e-05,
+      "loss": 0.5506,
+      "step": 4166
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3658986217297458,
+      "learning_rate": 3.321826830116012e-05,
+      "loss": 0.5539,
+      "step": 4167
+    },
+    {
+      "epoch": 0.7409777777777777,
+      "grad_norm": 0.35742008880722825,
+      "learning_rate": 3.3175420748242406e-05,
+      "loss": 0.568,
+      "step": 4168
+    },
+    {
+      "epoch": 0.7411555555555556,
+      "grad_norm": 0.3422450316940728,
+      "learning_rate": 3.313259535111478e-05,
+      "loss": 0.5586,
+      "step": 4169
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.39071089627359307,
+      "learning_rate": 3.3089792123976195e-05,
+      "loss": 0.5969,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7415111111111111,
+      "grad_norm": 0.3349522184099262,
+      "learning_rate": 3.3047011081018e-05,
+      "loss": 0.5461,
+      "step": 4171
+    },
+    {
+      "epoch": 0.7416888888888888,
+      "grad_norm": 0.33970565814244474,
+      "learning_rate": 3.300425223642444e-05,
+      "loss": 0.5523,
+      "step": 4172
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.3460660317974209,
+      "learning_rate": 3.296151560437214e-05,
+      "loss": 0.5691,
+      "step": 4173
+    },
+    {
+      "epoch": 0.7420444444444444,
+      "grad_norm": 0.35764984100815206,
+      "learning_rate": 3.2918801199030635e-05,
+      "loss": 0.5428,
+      "step": 4174
+    },
+    {
+      "epoch": 0.7422222222222222,
+      "grad_norm": 0.3317173069594749,
+      "learning_rate": 3.287610903456181e-05,
+      "loss": 0.5737,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.337324590260528,
+      "learning_rate": 3.283343912512046e-05,
+      "loss": 0.562,
+      "step": 4176
+    },
+    {
+      "epoch": 0.7425777777777778,
+      "grad_norm": 0.35698393268935386,
+      "learning_rate": 3.279079148485375e-05,
+      "loss": 0.573,
+      "step": 4177
+    },
+    {
+      "epoch": 0.7427555555555555,
+      "grad_norm": 0.4109720519173487,
+      "learning_rate": 3.27481661279016e-05,
+      "loss": 0.6212,
+      "step": 4178
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.3387820550022395,
+      "learning_rate": 3.2705563068396514e-05,
+      "loss": 0.5678,
+      "step": 4179
+    },
+    {
+      "epoch": 0.7431111111111111,
+      "grad_norm": 0.35281991014179076,
+      "learning_rate": 3.266298232046362e-05,
+      "loss": 0.5521,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7432888888888889,
+      "grad_norm": 0.34370307132414746,
+      "learning_rate": 3.26204238982206e-05,
+      "loss": 0.5439,
+      "step": 4181
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.48350048148904473,
+      "learning_rate": 3.257788781577777e-05,
+      "loss": 0.5635,
+      "step": 4182
+    },
+    {
+      "epoch": 0.7436444444444444,
+      "grad_norm": 0.35462018137979245,
+      "learning_rate": 3.253537408723805e-05,
+      "loss": 0.5497,
+      "step": 4183
+    },
+    {
+      "epoch": 0.7438222222222223,
+      "grad_norm": 0.33374514836867014,
+      "learning_rate": 3.249288272669691e-05,
+      "loss": 0.5454,
+      "step": 4184
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.36179135386948374,
+      "learning_rate": 3.2450413748242437e-05,
+      "loss": 0.5886,
+      "step": 4185
+    },
+    {
+      "epoch": 0.7441777777777778,
+      "grad_norm": 0.35322559599523196,
+      "learning_rate": 3.240796716595528e-05,
+      "loss": 0.5571,
+      "step": 4186
+    },
+    {
+      "epoch": 0.7443555555555555,
+      "grad_norm": 0.3521601650138431,
+      "learning_rate": 3.236554299390866e-05,
+      "loss": 0.5457,
+      "step": 4187
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.3528364935475116,
+      "learning_rate": 3.2323141246168396e-05,
+      "loss": 0.5872,
+      "step": 4188
+    },
+    {
+      "epoch": 0.7447111111111111,
+      "grad_norm": 0.34858074322798915,
+      "learning_rate": 3.2280761936792837e-05,
+      "loss": 0.582,
+      "step": 4189
+    },
+    {
+      "epoch": 0.7448888888888889,
+      "grad_norm": 0.3862473146426205,
+      "learning_rate": 3.2238405079832936e-05,
+      "loss": 0.5487,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.3512183257338081,
+      "learning_rate": 3.219607068933208e-05,
+      "loss": 0.5913,
+      "step": 4191
+    },
+    {
+      "epoch": 0.7452444444444445,
+      "grad_norm": 0.35140397046037664,
+      "learning_rate": 3.2153758779326435e-05,
+      "loss": 0.5792,
+      "step": 4192
+    },
+    {
+      "epoch": 0.7454222222222222,
+      "grad_norm": 0.34880024133834064,
+      "learning_rate": 3.211146936384445e-05,
+      "loss": 0.5893,
+      "step": 4193
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3924495259050675,
+      "learning_rate": 3.2069202456907366e-05,
+      "loss": 0.5931,
+      "step": 4194
+    },
+    {
+      "epoch": 0.7457777777777778,
+      "grad_norm": 0.34911644870176195,
+      "learning_rate": 3.202695807252871e-05,
+      "loss": 0.554,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7459555555555556,
+      "grad_norm": 0.3348938294170673,
+      "learning_rate": 3.1984736224714816e-05,
+      "loss": 0.5532,
+      "step": 4196
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.35132961633684906,
+      "learning_rate": 3.194253692746425e-05,
+      "loss": 0.5817,
+      "step": 4197
+    },
+    {
+      "epoch": 0.7463111111111111,
+      "grad_norm": 0.3206817272795673,
+      "learning_rate": 3.19003601947684e-05,
+      "loss": 0.5528,
+      "step": 4198
+    },
+    {
+      "epoch": 0.7464888888888889,
+      "grad_norm": 0.32968430977894264,
+      "learning_rate": 3.185820604061088e-05,
+      "loss": 0.5327,
+      "step": 4199
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.3387615682396391,
+      "learning_rate": 3.1816074478968106e-05,
+      "loss": 0.5469,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7468444444444444,
+      "grad_norm": 0.3718264749895764,
+      "learning_rate": 3.1773965523808754e-05,
+      "loss": 0.6096,
+      "step": 4201
+    },
+    {
+      "epoch": 0.7470222222222223,
+      "grad_norm": 0.3650002592452039,
+      "learning_rate": 3.173187918909416e-05,
+      "loss": 0.6021,
+      "step": 4202
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3415189432822939,
+      "learning_rate": 3.1689815488778096e-05,
+      "loss": 0.5631,
+      "step": 4203
+    },
+    {
+      "epoch": 0.7473777777777778,
+      "grad_norm": 0.3428618174151365,
+      "learning_rate": 3.164777443680684e-05,
+      "loss": 0.5772,
+      "step": 4204
+    },
+    {
+      "epoch": 0.7475555555555555,
+      "grad_norm": 0.33093855710863396,
+      "learning_rate": 3.16057560471192e-05,
+      "loss": 0.5829,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.3498256505756481,
+      "learning_rate": 3.1563760333646395e-05,
+      "loss": 0.5645,
+      "step": 4206
+    },
+    {
+      "epoch": 0.7479111111111111,
+      "grad_norm": 0.3577293233560542,
+      "learning_rate": 3.152178731031219e-05,
+      "loss": 0.6156,
+      "step": 4207
+    },
+    {
+      "epoch": 0.7480888888888889,
+      "grad_norm": 0.34255333849627845,
+      "learning_rate": 3.14798369910328e-05,
+      "loss": 0.5747,
+      "step": 4208
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.33516520282152806,
+      "learning_rate": 3.1437909389716915e-05,
+      "loss": 0.5747,
+      "step": 4209
+    },
+    {
+      "epoch": 0.7484444444444445,
+      "grad_norm": 0.3442120636670075,
+      "learning_rate": 3.13960045202657e-05,
+      "loss": 0.6036,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7486222222222222,
+      "grad_norm": 0.3731906782420752,
+      "learning_rate": 3.1354122396572774e-05,
+      "loss": 0.6107,
+      "step": 4211
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.33151817581592286,
+      "learning_rate": 3.1312263032524216e-05,
+      "loss": 0.5398,
+      "step": 4212
+    },
+    {
+      "epoch": 0.7489777777777777,
+      "grad_norm": 0.3468051855372648,
+      "learning_rate": 3.127042644199856e-05,
+      "loss": 0.5577,
+      "step": 4213
+    },
+    {
+      "epoch": 0.7491555555555556,
+      "grad_norm": 0.3631674818492094,
+      "learning_rate": 3.1228612638866795e-05,
+      "loss": 0.5899,
+      "step": 4214
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.38503654967646217,
+      "learning_rate": 3.118682163699236e-05,
+      "loss": 0.593,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7495111111111111,
+      "grad_norm": 0.3266389435776212,
+      "learning_rate": 3.114505345023113e-05,
+      "loss": 0.5386,
+      "step": 4216
+    },
+    {
+      "epoch": 0.7496888888888888,
+      "grad_norm": 0.34972426476361096,
+      "learning_rate": 3.110330809243134e-05,
+      "loss": 0.5787,
+      "step": 4217
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.3417197018503155,
+      "learning_rate": 3.106158557743385e-05,
+      "loss": 0.5669,
+      "step": 4218
+    },
+    {
+      "epoch": 0.7500444444444444,
+      "grad_norm": 0.35981740931242173,
+      "learning_rate": 3.101988591907168e-05,
+      "loss": 0.5593,
+      "step": 4219
+    },
+    {
+      "epoch": 0.7502222222222222,
+      "grad_norm": 0.3528973730962451,
+      "learning_rate": 3.0978209131170566e-05,
+      "loss": 0.5666,
+      "step": 4220
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.33954281288581745,
+      "learning_rate": 3.0936555227548355e-05,
+      "loss": 0.5521,
+      "step": 4221
+    },
+    {
+      "epoch": 0.7505777777777778,
+      "grad_norm": 0.3488425071859353,
+      "learning_rate": 3.089492422201561e-05,
+      "loss": 0.5729,
+      "step": 4222
+    },
+    {
+      "epoch": 0.7507555555555555,
+      "grad_norm": 0.3272921881612388,
+      "learning_rate": 3.085331612837502e-05,
+      "loss": 0.5368,
+      "step": 4223
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.36300224569083905,
+      "learning_rate": 3.081173096042194e-05,
+      "loss": 0.6139,
+      "step": 4224
+    },
+    {
+      "epoch": 0.7511111111111111,
+      "grad_norm": 0.3574674623555468,
+      "learning_rate": 3.0770168731943895e-05,
+      "loss": 0.593,
+      "step": 4225
+    },
+    {
+      "epoch": 0.7512888888888889,
+      "grad_norm": 0.3511692874694419,
+      "learning_rate": 3.072862945672094e-05,
+      "loss": 0.5444,
+      "step": 4226
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.3582921225868174,
+      "learning_rate": 3.068711314852548e-05,
+      "loss": 0.5633,
+      "step": 4227
+    },
+    {
+      "epoch": 0.7516444444444444,
+      "grad_norm": 0.3688101911355421,
+      "learning_rate": 3.064561982112232e-05,
+      "loss": 0.5409,
+      "step": 4228
+    },
+    {
+      "epoch": 0.7518222222222222,
+      "grad_norm": 0.3686330421240707,
+      "learning_rate": 3.060414948826862e-05,
+      "loss": 0.6031,
+      "step": 4229
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.38082723294809234,
+      "learning_rate": 3.056270216371395e-05,
+      "loss": 0.616,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7521777777777777,
+      "grad_norm": 0.38249101558826504,
+      "learning_rate": 3.0521277861200216e-05,
+      "loss": 0.5973,
+      "step": 4231
+    },
+    {
+      "epoch": 0.7523555555555556,
+      "grad_norm": 0.36227280753592483,
+      "learning_rate": 3.047987659446172e-05,
+      "loss": 0.5794,
+      "step": 4232
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.38258056128230283,
+      "learning_rate": 3.043849837722511e-05,
+      "loss": 0.6031,
+      "step": 4233
+    },
+    {
+      "epoch": 0.7527111111111111,
+      "grad_norm": 0.34606211062994974,
+      "learning_rate": 3.039714322320939e-05,
+      "loss": 0.6047,
+      "step": 4234
+    },
+    {
+      "epoch": 0.7528888888888889,
+      "grad_norm": 0.32825314961616503,
+      "learning_rate": 3.0355811146125935e-05,
+      "loss": 0.5289,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.34763060449947897,
+      "learning_rate": 3.0314502159678458e-05,
+      "loss": 0.5481,
+      "step": 4236
+    },
+    {
+      "epoch": 0.7532444444444445,
+      "grad_norm": 0.35145590639389757,
+      "learning_rate": 3.0273216277563e-05,
+      "loss": 0.5829,
+      "step": 4237
+    },
+    {
+      "epoch": 0.7534222222222222,
+      "grad_norm": 0.3732241198757729,
+      "learning_rate": 3.023195351346797e-05,
+      "loss": 0.6407,
+      "step": 4238
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.34161327580409323,
+      "learning_rate": 3.0190713881074105e-05,
+      "loss": 0.5641,
+      "step": 4239
+    },
+    {
+      "epoch": 0.7537777777777778,
+      "grad_norm": 0.3498273968194717,
+      "learning_rate": 3.014949739405448e-05,
+      "loss": 0.5837,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7539555555555556,
+      "grad_norm": 0.38238022147716455,
+      "learning_rate": 3.010830406607441e-05,
+      "loss": 0.588,
+      "step": 4241
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.3899432688526802,
+      "learning_rate": 3.0067133910791722e-05,
+      "loss": 0.6273,
+      "step": 4242
+    },
+    {
+      "epoch": 0.7543111111111112,
+      "grad_norm": 0.38350916865529533,
+      "learning_rate": 3.002598694185631e-05,
+      "loss": 0.6152,
+      "step": 4243
+    },
+    {
+      "epoch": 0.7544888888888889,
+      "grad_norm": 0.3749368999724824,
+      "learning_rate": 2.998486317291066e-05,
+      "loss": 0.5729,
+      "step": 4244
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.36091053563070535,
+      "learning_rate": 2.9943762617589264e-05,
+      "loss": 0.5561,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7548444444444444,
+      "grad_norm": 0.36189171484542015,
+      "learning_rate": 2.990268528951923e-05,
+      "loss": 0.5456,
+      "step": 4246
+    },
+    {
+      "epoch": 0.7550222222222223,
+      "grad_norm": 0.34447403945244204,
+      "learning_rate": 2.986163120231965e-05,
+      "loss": 0.5423,
+      "step": 4247
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3429685993377397,
+      "learning_rate": 2.9820600369602224e-05,
+      "loss": 0.5674,
+      "step": 4248
+    },
+    {
+      "epoch": 0.7553777777777778,
+      "grad_norm": 0.35842641976135975,
+      "learning_rate": 2.977959280497068e-05,
+      "loss": 0.5845,
+      "step": 4249
+    },
+    {
+      "epoch": 0.7555555555555555,
+      "grad_norm": 0.36809913546801193,
+      "learning_rate": 2.9738608522021173e-05,
+      "loss": 0.6279,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.3434563800908367,
+      "learning_rate": 2.969764753434209e-05,
+      "loss": 0.5724,
+      "step": 4251
+    },
+    {
+      "epoch": 0.7559111111111111,
+      "grad_norm": 0.3784185150737185,
+      "learning_rate": 2.965670985551412e-05,
+      "loss": 0.5857,
+      "step": 4252
+    },
+    {
+      "epoch": 0.7560888888888889,
+      "grad_norm": 0.35480404145148375,
+      "learning_rate": 2.9615795499110222e-05,
+      "loss": 0.5303,
+      "step": 4253
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.39442699998781444,
+      "learning_rate": 2.9574904478695586e-05,
+      "loss": 0.5298,
+      "step": 4254
+    },
+    {
+      "epoch": 0.7564444444444445,
+      "grad_norm": 0.34309851387691853,
+      "learning_rate": 2.9534036807827726e-05,
+      "loss": 0.5789,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7566222222222222,
+      "grad_norm": 0.3335078622608469,
+      "learning_rate": 2.9493192500056345e-05,
+      "loss": 0.5448,
+      "step": 4256
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.35155594852221766,
+      "learning_rate": 2.9452371568923455e-05,
+      "loss": 0.5482,
+      "step": 4257
+    },
+    {
+      "epoch": 0.7569777777777777,
+      "grad_norm": 0.344885376604275,
+      "learning_rate": 2.9411574027963296e-05,
+      "loss": 0.5591,
+      "step": 4258
+    },
+    {
+      "epoch": 0.7571555555555556,
+      "grad_norm": 0.34105851054595154,
+      "learning_rate": 2.9370799890702362e-05,
+      "loss": 0.5607,
+      "step": 4259
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.3641676724156971,
+      "learning_rate": 2.9330049170659357e-05,
+      "loss": 0.5547,
+      "step": 4260
+    },
+    {
+      "epoch": 0.7575111111111111,
+      "grad_norm": 0.36797442473954056,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.573,
+      "step": 4261
+    },
+    {
+      "epoch": 0.7576888888888889,
+      "grad_norm": 0.373526058225425,
+      "learning_rate": 2.9248618036263255e-05,
+      "loss": 0.624,
+      "step": 4262
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.35284043969981105,
+      "learning_rate": 2.920793764890878e-05,
+      "loss": 0.5553,
+      "step": 4263
+    },
+    {
+      "epoch": 0.7580444444444444,
+      "grad_norm": 0.3508506808649338,
+      "learning_rate": 2.9167280732769463e-05,
+      "loss": 0.544,
+      "step": 4264
+    },
+    {
+      "epoch": 0.7582222222222222,
+      "grad_norm": 0.3964831825740827,
+      "learning_rate": 2.9126647301325173e-05,
+      "loss": 0.5755,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3366898115073589,
+      "learning_rate": 2.908603736804798e-05,
+      "loss": 0.6127,
+      "step": 4266
+    },
+    {
+      "epoch": 0.7585777777777778,
+      "grad_norm": 0.3588474336080617,
+      "learning_rate": 2.9045450946402175e-05,
+      "loss": 0.5849,
+      "step": 4267
+    },
+    {
+      "epoch": 0.7587555555555555,
+      "grad_norm": 0.36381022116264883,
+      "learning_rate": 2.9004888049844248e-05,
+      "loss": 0.5851,
+      "step": 4268
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.3612514999903744,
+      "learning_rate": 2.8964348691822895e-05,
+      "loss": 0.556,
+      "step": 4269
+    },
+    {
+      "epoch": 0.7591111111111111,
+      "grad_norm": 0.3583575434999154,
+      "learning_rate": 2.892383288577898e-05,
+      "loss": 0.5757,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7592888888888889,
+      "grad_norm": 0.3523042240458458,
+      "learning_rate": 2.8883340645145597e-05,
+      "loss": 0.5763,
+      "step": 4271
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.35235017432930843,
+      "learning_rate": 2.8842871983347998e-05,
+      "loss": 0.5433,
+      "step": 4272
+    },
+    {
+      "epoch": 0.7596444444444445,
+      "grad_norm": 0.3333974637428611,
+      "learning_rate": 2.8802426913803638e-05,
+      "loss": 0.5627,
+      "step": 4273
+    },
+    {
+      "epoch": 0.7598222222222222,
+      "grad_norm": 0.3813781671578012,
+      "learning_rate": 2.8762005449922147e-05,
+      "loss": 0.5742,
+      "step": 4274
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.6391010007936021,
+      "learning_rate": 2.8721607605105337e-05,
+      "loss": 0.5756,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7601777777777777,
+      "grad_norm": 0.384595088211196,
+      "learning_rate": 2.8681233392747086e-05,
+      "loss": 0.5757,
+      "step": 4276
+    },
+    {
+      "epoch": 0.7603555555555556,
+      "grad_norm": 0.3556352714948297,
+      "learning_rate": 2.864088282623366e-05,
+      "loss": 0.5824,
+      "step": 4277
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.3295268869241266,
+      "learning_rate": 2.8600555918943218e-05,
+      "loss": 0.5339,
+      "step": 4278
+    },
+    {
+      "epoch": 0.7607111111111111,
+      "grad_norm": 0.37932401859673354,
+      "learning_rate": 2.8560252684246324e-05,
+      "loss": 0.5995,
+      "step": 4279
+    },
+    {
+      "epoch": 0.7608888888888888,
+      "grad_norm": 0.3480377899987346,
+      "learning_rate": 2.8519973135505462e-05,
+      "loss": 0.5371,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.3306271257294146,
+      "learning_rate": 2.8479717286075502e-05,
+      "loss": 0.5305,
+      "step": 4281
+    },
+    {
+      "epoch": 0.7612444444444444,
+      "grad_norm": 0.3618439726182244,
+      "learning_rate": 2.84394851493032e-05,
+      "loss": 0.5685,
+      "step": 4282
+    },
+    {
+      "epoch": 0.7614222222222222,
+      "grad_norm": 0.3677953744762058,
+      "learning_rate": 2.8399276738527714e-05,
+      "loss": 0.5841,
+      "step": 4283
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.37789648238799733,
+      "learning_rate": 2.8359092067080106e-05,
+      "loss": 0.6087,
+      "step": 4284
+    },
+    {
+      "epoch": 0.7617777777777778,
+      "grad_norm": 0.3463045651522611,
+      "learning_rate": 2.83189311482837e-05,
+      "loss": 0.4997,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7619555555555556,
+      "grad_norm": 0.37363212688999853,
+      "learning_rate": 2.82787939954539e-05,
+      "loss": 0.5804,
+      "step": 4286
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.4170357868104384,
+      "learning_rate": 2.823868062189825e-05,
+      "loss": 0.5486,
+      "step": 4287
+    },
+    {
+      "epoch": 0.7623111111111112,
+      "grad_norm": 0.33958013225061306,
+      "learning_rate": 2.8198591040916387e-05,
+      "loss": 0.5661,
+      "step": 4288
+    },
+    {
+      "epoch": 0.7624888888888889,
+      "grad_norm": 0.3405927889172959,
+      "learning_rate": 2.8158525265800094e-05,
+      "loss": 0.5762,
+      "step": 4289
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.34444028874973215,
+      "learning_rate": 2.811848330983321e-05,
+      "loss": 0.5701,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7628444444444444,
+      "grad_norm": 0.34313612935464455,
+      "learning_rate": 2.8078465186291724e-05,
+      "loss": 0.5984,
+      "step": 4291
+    },
+    {
+      "epoch": 0.7630222222222223,
+      "grad_norm": 0.3353216679805234,
+      "learning_rate": 2.8038470908443714e-05,
+      "loss": 0.5581,
+      "step": 4292
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.39899946565810035,
+      "learning_rate": 2.799850048954932e-05,
+      "loss": 0.576,
+      "step": 4293
+    },
+    {
+      "epoch": 0.7633777777777778,
+      "grad_norm": 0.36971140324940005,
+      "learning_rate": 2.795855394286081e-05,
+      "loss": 0.5808,
+      "step": 4294
+    },
+    {
+      "epoch": 0.7635555555555555,
+      "grad_norm": 0.3399741677234007,
+      "learning_rate": 2.791863128162251e-05,
+      "loss": 0.532,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.3296102634535836,
+      "learning_rate": 2.787873251907086e-05,
+      "loss": 0.5761,
+      "step": 4296
+    },
+    {
+      "epoch": 0.7639111111111111,
+      "grad_norm": 0.3553281966073167,
+      "learning_rate": 2.7838857668434327e-05,
+      "loss": 0.6294,
+      "step": 4297
+    },
+    {
+      "epoch": 0.7640888888888889,
+      "grad_norm": 0.3345832561474351,
+      "learning_rate": 2.779900674293351e-05,
+      "loss": 0.5623,
+      "step": 4298
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.3416907437433639,
+      "learning_rate": 2.775917975578104e-05,
+      "loss": 0.5966,
+      "step": 4299
+    },
+    {
+      "epoch": 0.7644444444444445,
+      "grad_norm": 0.3534629423549445,
+      "learning_rate": 2.7719376720181546e-05,
+      "loss": 0.6135,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7646222222222222,
+      "grad_norm": 0.3780017127778086,
+      "learning_rate": 2.76795976493319e-05,
+      "loss": 0.5498,
+      "step": 4301
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3794798968228671,
+      "learning_rate": 2.7639842556420792e-05,
+      "loss": 0.5827,
+      "step": 4302
+    },
+    {
+      "epoch": 0.7649777777777778,
+      "grad_norm": 0.357809527543104,
+      "learning_rate": 2.7600111454629207e-05,
+      "loss": 0.5473,
+      "step": 4303
+    },
+    {
+      "epoch": 0.7651555555555556,
+      "grad_norm": 0.3351256496248654,
+      "learning_rate": 2.756040435712992e-05,
+      "loss": 0.5639,
+      "step": 4304
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.3566561021838032,
+      "learning_rate": 2.7520721277088024e-05,
+      "loss": 0.5611,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7655111111111111,
+      "grad_norm": 0.3585275296411708,
+      "learning_rate": 2.7481062227660348e-05,
+      "loss": 0.5955,
+      "step": 4306
+    },
+    {
+      "epoch": 0.7656888888888889,
+      "grad_norm": 0.35075049383760293,
+      "learning_rate": 2.7441427221996065e-05,
+      "loss": 0.5567,
+      "step": 4307
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.3693363754292584,
+      "learning_rate": 2.74018162732361e-05,
+      "loss": 0.5625,
+      "step": 4308
+    },
+    {
+      "epoch": 0.7660444444444444,
+      "grad_norm": 0.3336393395709656,
+      "learning_rate": 2.7362229394513584e-05,
+      "loss": 0.5304,
+      "step": 4309
+    },
+    {
+      "epoch": 0.7662222222222222,
+      "grad_norm": 0.3337213509668497,
+      "learning_rate": 2.7322666598953574e-05,
+      "loss": 0.5339,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3580448734708473,
+      "learning_rate": 2.72831278996732e-05,
+      "loss": 0.5871,
+      "step": 4311
+    },
+    {
+      "epoch": 0.7665777777777778,
+      "grad_norm": 0.3458516402497339,
+      "learning_rate": 2.724361330978157e-05,
+      "loss": 0.5535,
+      "step": 4312
+    },
+    {
+      "epoch": 0.7667555555555555,
+      "grad_norm": 0.3493775043646141,
+      "learning_rate": 2.72041228423798e-05,
+      "loss": 0.538,
+      "step": 4313
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.38591941572637034,
+      "learning_rate": 2.7164656510561026e-05,
+      "loss": 0.571,
+      "step": 4314
+    },
+    {
+      "epoch": 0.7671111111111111,
+      "grad_norm": 0.34755620415215,
+      "learning_rate": 2.7125214327410354e-05,
+      "loss": 0.5964,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7672888888888889,
+      "grad_norm": 0.37337456092855464,
+      "learning_rate": 2.7085796306004906e-05,
+      "loss": 0.577,
+      "step": 4316
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.3393712759076397,
+      "learning_rate": 2.70464024594138e-05,
+      "loss": 0.5622,
+      "step": 4317
+    },
+    {
+      "epoch": 0.7676444444444445,
+      "grad_norm": 0.3472721818167264,
+      "learning_rate": 2.7007032800698105e-05,
+      "loss": 0.5587,
+      "step": 4318
+    },
+    {
+      "epoch": 0.7678222222222222,
+      "grad_norm": 0.33382249429187827,
+      "learning_rate": 2.6967687342910898e-05,
+      "loss": 0.5763,
+      "step": 4319
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3345773200547266,
+      "learning_rate": 2.6928366099097235e-05,
+      "loss": 0.5203,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7681777777777777,
+      "grad_norm": 0.36781694157511413,
+      "learning_rate": 2.6889069082294114e-05,
+      "loss": 0.586,
+      "step": 4321
+    },
+    {
+      "epoch": 0.7683555555555556,
+      "grad_norm": 0.37297300973136055,
+      "learning_rate": 2.6849796305530538e-05,
+      "loss": 0.5745,
+      "step": 4322
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.3556462465455715,
+      "learning_rate": 2.681054778182748e-05,
+      "loss": 0.5866,
+      "step": 4323
+    },
+    {
+      "epoch": 0.7687111111111111,
+      "grad_norm": 0.3484887184658991,
+      "learning_rate": 2.6771323524197756e-05,
+      "loss": 0.5885,
+      "step": 4324
+    },
+    {
+      "epoch": 0.7688888888888888,
+      "grad_norm": 0.35259251401627323,
+      "learning_rate": 2.6732123545646347e-05,
+      "loss": 0.5827,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.35727181634412014,
+      "learning_rate": 2.669294785916995e-05,
+      "loss": 0.5419,
+      "step": 4326
+    },
+    {
+      "epoch": 0.7692444444444444,
+      "grad_norm": 0.4038358436186105,
+      "learning_rate": 2.6653796477757432e-05,
+      "loss": 0.5643,
+      "step": 4327
+    },
+    {
+      "epoch": 0.7694222222222222,
+      "grad_norm": 0.3348318890939429,
+      "learning_rate": 2.661466941438938e-05,
+      "loss": 0.5379,
+      "step": 4328
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.3314200102478855,
+      "learning_rate": 2.6575566682038556e-05,
+      "loss": 0.5795,
+      "step": 4329
+    },
+    {
+      "epoch": 0.7697777777777778,
+      "grad_norm": 0.34230045633343376,
+      "learning_rate": 2.6536488293669392e-05,
+      "loss": 0.5316,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7699555555555555,
+      "grad_norm": 0.34712692150174207,
+      "learning_rate": 2.649743426223853e-05,
+      "loss": 0.5689,
+      "step": 4331
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.3637005118566602,
+      "learning_rate": 2.6458404600694263e-05,
+      "loss": 0.5732,
+      "step": 4332
+    },
+    {
+      "epoch": 0.7703111111111111,
+      "grad_norm": 0.3409605257991041,
+      "learning_rate": 2.6419399321977058e-05,
+      "loss": 0.5673,
+      "step": 4333
+    },
+    {
+      "epoch": 0.7704888888888889,
+      "grad_norm": 0.34102074326844156,
+      "learning_rate": 2.6380418439019062e-05,
+      "loss": 0.5411,
+      "step": 4334
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.41449092255940173,
+      "learning_rate": 2.6341461964744508e-05,
+      "loss": 0.558,
+      "step": 4335
+    },
+    {
+      "epoch": 0.7708444444444444,
+      "grad_norm": 0.35129929624361794,
+      "learning_rate": 2.6302529912069452e-05,
+      "loss": 0.5214,
+      "step": 4336
+    },
+    {
+      "epoch": 0.7710222222222223,
+      "grad_norm": 0.3440983635006046,
+      "learning_rate": 2.626362229390189e-05,
+      "loss": 0.558,
+      "step": 4337
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.367434874537577,
+      "learning_rate": 2.6224739123141684e-05,
+      "loss": 0.5552,
+      "step": 4338
+    },
+    {
+      "epoch": 0.7713777777777778,
+      "grad_norm": 0.3479638399219766,
+      "learning_rate": 2.618588041268063e-05,
+      "loss": 0.5925,
+      "step": 4339
+    },
+    {
+      "epoch": 0.7715555555555556,
+      "grad_norm": 0.36234312790855217,
+      "learning_rate": 2.6147046175402368e-05,
+      "loss": 0.5792,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.3348044773849246,
+      "learning_rate": 2.6108236424182465e-05,
+      "loss": 0.5454,
+      "step": 4341
+    },
+    {
+      "epoch": 0.7719111111111111,
+      "grad_norm": 0.36110873069662,
+      "learning_rate": 2.6069451171888336e-05,
+      "loss": 0.597,
+      "step": 4342
+    },
+    {
+      "epoch": 0.7720888888888889,
+      "grad_norm": 0.3653360457710555,
+      "learning_rate": 2.6030690431379312e-05,
+      "loss": 0.5769,
+      "step": 4343
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.36745737378190635,
+      "learning_rate": 2.5991954215506563e-05,
+      "loss": 0.5943,
+      "step": 4344
+    },
+    {
+      "epoch": 0.7724444444444445,
+      "grad_norm": 0.35139256054606194,
+      "learning_rate": 2.5953242537113142e-05,
+      "loss": 0.5625,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7726222222222222,
+      "grad_norm": 0.3727655064714463,
+      "learning_rate": 2.591455540903397e-05,
+      "loss": 0.5547,
+      "step": 4346
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.3649092121675021,
+      "learning_rate": 2.587589284409583e-05,
+      "loss": 0.6164,
+      "step": 4347
+    },
+    {
+      "epoch": 0.7729777777777778,
+      "grad_norm": 0.36540280578138495,
+      "learning_rate": 2.583725485511729e-05,
+      "loss": 0.5565,
+      "step": 4348
+    },
+    {
+      "epoch": 0.7731555555555556,
+      "grad_norm": 0.340857849254035,
+      "learning_rate": 2.5798641454908944e-05,
+      "loss": 0.5985,
+      "step": 4349
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.4160572957314163,
+      "learning_rate": 2.5760052656273002e-05,
+      "loss": 0.558,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7735111111111111,
+      "grad_norm": 0.361466601989381,
+      "learning_rate": 2.572148847200375e-05,
+      "loss": 0.6073,
+      "step": 4351
+    },
+    {
+      "epoch": 0.7736888888888889,
+      "grad_norm": 0.33777502252947933,
+      "learning_rate": 2.5682948914887106e-05,
+      "loss": 0.5166,
+      "step": 4352
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.7659817761463893,
+      "learning_rate": 2.564443399770101e-05,
+      "loss": 0.5562,
+      "step": 4353
+    },
+    {
+      "epoch": 0.7740444444444444,
+      "grad_norm": 0.3576573310205456,
+      "learning_rate": 2.5605943733215042e-05,
+      "loss": 0.5464,
+      "step": 4354
+    },
+    {
+      "epoch": 0.7742222222222223,
+      "grad_norm": 0.3488566127820092,
+      "learning_rate": 2.5567478134190824e-05,
+      "loss": 0.5681,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.33697213028749157,
+      "learning_rate": 2.5529037213381545e-05,
+      "loss": 0.5372,
+      "step": 4356
+    },
+    {
+      "epoch": 0.7745777777777778,
+      "grad_norm": 0.3347178334244202,
+      "learning_rate": 2.5490620983532497e-05,
+      "loss": 0.5966,
+      "step": 4357
+    },
+    {
+      "epoch": 0.7747555555555555,
+      "grad_norm": 0.3862298275640225,
+      "learning_rate": 2.545222945738053e-05,
+      "loss": 0.5907,
+      "step": 4358
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.379651023808002,
+      "learning_rate": 2.541386264765444e-05,
+      "loss": 0.6125,
+      "step": 4359
+    },
+    {
+      "epoch": 0.7751111111111111,
+      "grad_norm": 0.3518332473303472,
+      "learning_rate": 2.537552056707483e-05,
+      "loss": 0.5513,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7752888888888889,
+      "grad_norm": 0.358073703256899,
+      "learning_rate": 2.5337203228354035e-05,
+      "loss": 0.5361,
+      "step": 4361
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.36338966574155696,
+      "learning_rate": 2.529891064419625e-05,
+      "loss": 0.5621,
+      "step": 4362
+    },
+    {
+      "epoch": 0.7756444444444445,
+      "grad_norm": 0.35607301106378386,
+      "learning_rate": 2.5260642827297444e-05,
+      "loss": 0.5564,
+      "step": 4363
+    },
+    {
+      "epoch": 0.7758222222222222,
+      "grad_norm": 0.33652985093432897,
+      "learning_rate": 2.5222399790345354e-05,
+      "loss": 0.5385,
+      "step": 4364
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3457816274153398,
+      "learning_rate": 2.5184181546019515e-05,
+      "loss": 0.553,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7761777777777777,
+      "grad_norm": 0.34582483496806654,
+      "learning_rate": 2.514598810699126e-05,
+      "loss": 0.5392,
+      "step": 4366
+    },
+    {
+      "epoch": 0.7763555555555556,
+      "grad_norm": 0.3527195791056369,
+      "learning_rate": 2.5107819485923668e-05,
+      "loss": 0.5961,
+      "step": 4367
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.37363506435994237,
+      "learning_rate": 2.5069675695471617e-05,
+      "loss": 0.5517,
+      "step": 4368
+    },
+    {
+      "epoch": 0.7767111111111111,
+      "grad_norm": 0.36456307277282124,
+      "learning_rate": 2.5031556748281715e-05,
+      "loss": 0.5622,
+      "step": 4369
+    },
+    {
+      "epoch": 0.7768888888888889,
+      "grad_norm": 0.3335076885305479,
+      "learning_rate": 2.4993462656992384e-05,
+      "loss": 0.5848,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.3520408549118235,
+      "learning_rate": 2.4955393434233754e-05,
+      "loss": 0.5585,
+      "step": 4371
+    },
+    {
+      "epoch": 0.7772444444444444,
+      "grad_norm": 0.35848023015856245,
+      "learning_rate": 2.4917349092627752e-05,
+      "loss": 0.6004,
+      "step": 4372
+    },
+    {
+      "epoch": 0.7774222222222222,
+      "grad_norm": 0.34380938237682085,
+      "learning_rate": 2.4879329644788053e-05,
+      "loss": 0.5771,
+      "step": 4373
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3743374585688741,
+      "learning_rate": 2.4841335103319972e-05,
+      "loss": 0.5436,
+      "step": 4374
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 0.3390044209994759,
+      "learning_rate": 2.4803365480820785e-05,
+      "loss": 0.5625,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7779555555555555,
+      "grad_norm": 0.3652287624288249,
+      "learning_rate": 2.4765420789879257e-05,
+      "loss": 0.6076,
+      "step": 4376
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.3631401587485521,
+      "learning_rate": 2.4727501043076128e-05,
+      "loss": 0.6241,
+      "step": 4377
+    },
+    {
+      "epoch": 0.7783111111111111,
+      "grad_norm": 0.39266179299732584,
+      "learning_rate": 2.4689606252983623e-05,
+      "loss": 0.5936,
+      "step": 4378
+    },
+    {
+      "epoch": 0.7784888888888889,
+      "grad_norm": 0.3390134965453679,
+      "learning_rate": 2.465173643216594e-05,
+      "loss": 0.5304,
+      "step": 4379
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.36430903628025196,
+      "learning_rate": 2.4613891593178752e-05,
+      "loss": 0.5727,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7788444444444445,
+      "grad_norm": 0.36216824153035626,
+      "learning_rate": 2.4576071748569695e-05,
+      "loss": 0.6094,
+      "step": 4381
+    },
+    {
+      "epoch": 0.7790222222222222,
+      "grad_norm": 0.3594210930056404,
+      "learning_rate": 2.45382769108779e-05,
+      "loss": 0.5745,
+      "step": 4382
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.35199554147819706,
+      "learning_rate": 2.4500507092634338e-05,
+      "loss": 0.5746,
+      "step": 4383
+    },
+    {
+      "epoch": 0.7793777777777777,
+      "grad_norm": 0.3675465305250382,
+      "learning_rate": 2.4462762306361654e-05,
+      "loss": 0.5677,
+      "step": 4384
+    },
+    {
+      "epoch": 0.7795555555555556,
+      "grad_norm": 0.39778150996015027,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.6007,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.341386610907,
+      "learning_rate": 2.4387347879777955e-05,
+      "loss": 0.5412,
+      "step": 4386
+    },
+    {
+      "epoch": 0.7799111111111111,
+      "grad_norm": 0.36750685066754046,
+      "learning_rate": 2.434967826447072e-05,
+      "loss": 0.5777,
+      "step": 4387
+    },
+    {
+      "epoch": 0.7800888888888889,
+      "grad_norm": 0.3492894710233438,
+      "learning_rate": 2.431203373114187e-05,
+      "loss": 0.5351,
+      "step": 4388
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.37279206195034853,
+      "learning_rate": 2.427441429227253e-05,
+      "loss": 0.5897,
+      "step": 4389
+    },
+    {
+      "epoch": 0.7804444444444445,
+      "grad_norm": 0.37704310937474866,
+      "learning_rate": 2.4236819960335476e-05,
+      "loss": 0.6039,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7806222222222222,
+      "grad_norm": 0.3341105513119436,
+      "learning_rate": 2.4199250747795154e-05,
+      "loss": 0.5557,
+      "step": 4391
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.36680834655227373,
+      "learning_rate": 2.41617066671077e-05,
+      "loss": 0.6013,
+      "step": 4392
+    },
+    {
+      "epoch": 0.7809777777777778,
+      "grad_norm": 0.34670883620130083,
+      "learning_rate": 2.4124187730720917e-05,
+      "loss": 0.5573,
+      "step": 4393
+    },
+    {
+      "epoch": 0.7811555555555556,
+      "grad_norm": 0.3569936970649313,
+      "learning_rate": 2.4086693951074247e-05,
+      "loss": 0.5742,
+      "step": 4394
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.3478642969035027,
+      "learning_rate": 2.4049225340598835e-05,
+      "loss": 0.5634,
+      "step": 4395
+    },
+    {
+      "epoch": 0.7815111111111112,
+      "grad_norm": 0.3828596888038592,
+      "learning_rate": 2.4011781911717436e-05,
+      "loss": 0.568,
+      "step": 4396
+    },
+    {
+      "epoch": 0.7816888888888889,
+      "grad_norm": 0.4140886851492242,
+      "learning_rate": 2.3974363676844503e-05,
+      "loss": 0.5715,
+      "step": 4397
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.3707233917101248,
+      "learning_rate": 2.3936970648386038e-05,
+      "loss": 0.6198,
+      "step": 4398
+    },
+    {
+      "epoch": 0.7820444444444444,
+      "grad_norm": 0.34824057998777835,
+      "learning_rate": 2.3899602838739864e-05,
+      "loss": 0.553,
+      "step": 4399
+    },
+    {
+      "epoch": 0.7822222222222223,
+      "grad_norm": 0.3529386332257371,
+      "learning_rate": 2.386226026029521e-05,
+      "loss": 0.5924,
+      "step": 4400
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.34867121025093356,
+      "learning_rate": 2.382494292543319e-05,
+      "loss": 0.5934,
+      "step": 4401
+    },
+    {
+      "epoch": 0.7825777777777778,
+      "grad_norm": 0.35736787093786476,
+      "learning_rate": 2.378765084652631e-05,
+      "loss": 0.6343,
+      "step": 4402
+    },
+    {
+      "epoch": 0.7827555555555555,
+      "grad_norm": 0.5891866621211168,
+      "learning_rate": 2.3750384035938922e-05,
+      "loss": 0.5611,
+      "step": 4403
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.3528181470032474,
+      "learning_rate": 2.3713142506026786e-05,
+      "loss": 0.5825,
+      "step": 4404
+    },
+    {
+      "epoch": 0.7831111111111111,
+      "grad_norm": 0.358480746142133,
+      "learning_rate": 2.3675926269137495e-05,
+      "loss": 0.5968,
+      "step": 4405
+    },
+    {
+      "epoch": 0.7832888888888889,
+      "grad_norm": 0.355470594033632,
+      "learning_rate": 2.363873533761005e-05,
+      "loss": 0.5891,
+      "step": 4406
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.33530432665158394,
+      "learning_rate": 2.360156972377522e-05,
+      "loss": 0.5421,
+      "step": 4407
+    },
+    {
+      "epoch": 0.7836444444444445,
+      "grad_norm": 0.335572489127656,
+      "learning_rate": 2.3564429439955303e-05,
+      "loss": 0.5477,
+      "step": 4408
+    },
+    {
+      "epoch": 0.7838222222222222,
+      "grad_norm": 0.4286231148471792,
+      "learning_rate": 2.3527314498464215e-05,
+      "loss": 0.5755,
+      "step": 4409
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.36012731462924885,
+      "learning_rate": 2.3490224911607473e-05,
+      "loss": 0.5955,
+      "step": 4410
+    },
+    {
+      "epoch": 0.7841777777777778,
+      "grad_norm": 0.3485587472638476,
+      "learning_rate": 2.3453160691682197e-05,
+      "loss": 0.5263,
+      "step": 4411
+    },
+    {
+      "epoch": 0.7843555555555556,
+      "grad_norm": 0.3578893285300129,
+      "learning_rate": 2.3416121850977056e-05,
+      "loss": 0.5919,
+      "step": 4412
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.35010437123247085,
+      "learning_rate": 2.3379108401772365e-05,
+      "loss": 0.6142,
+      "step": 4413
+    },
+    {
+      "epoch": 0.7847111111111111,
+      "grad_norm": 0.3816677020835402,
+      "learning_rate": 2.334212035633997e-05,
+      "loss": 0.5826,
+      "step": 4414
+    },
+    {
+      "epoch": 0.7848888888888889,
+      "grad_norm": 0.354311571518565,
+      "learning_rate": 2.3305157726943327e-05,
+      "loss": 0.5465,
+      "step": 4415
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.3994530359459713,
+      "learning_rate": 2.3268220525837437e-05,
+      "loss": 0.6093,
+      "step": 4416
+    },
+    {
+      "epoch": 0.7852444444444444,
+      "grad_norm": 0.34590691159766396,
+      "learning_rate": 2.3231308765268888e-05,
+      "loss": 0.5938,
+      "step": 4417
+    },
+    {
+      "epoch": 0.7854222222222222,
+      "grad_norm": 0.34528876416571636,
+      "learning_rate": 2.319442245747584e-05,
+      "loss": 0.564,
+      "step": 4418
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.36273041096313324,
+      "learning_rate": 2.3157561614687995e-05,
+      "loss": 0.5657,
+      "step": 4419
+    },
+    {
+      "epoch": 0.7857777777777778,
+      "grad_norm": 0.3627294462984221,
+      "learning_rate": 2.312072624912662e-05,
+      "loss": 0.5636,
+      "step": 4420
+    },
+    {
+      "epoch": 0.7859555555555555,
+      "grad_norm": 0.33780571528960485,
+      "learning_rate": 2.3083916373004577e-05,
+      "loss": 0.5602,
+      "step": 4421
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.3300356177276644,
+      "learning_rate": 2.3047131998526138e-05,
+      "loss": 0.561,
+      "step": 4422
+    },
+    {
+      "epoch": 0.7863111111111111,
+      "grad_norm": 0.3498848742243878,
+      "learning_rate": 2.301037313788733e-05,
+      "loss": 0.5564,
+      "step": 4423
+    },
+    {
+      "epoch": 0.7864888888888889,
+      "grad_norm": 0.37315436482006004,
+      "learning_rate": 2.2973639803275503e-05,
+      "loss": 0.5794,
+      "step": 4424
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.3518328979317764,
+      "learning_rate": 2.293693200686976e-05,
+      "loss": 0.533,
+      "step": 4425
+    },
+    {
+      "epoch": 0.7868444444444445,
+      "grad_norm": 0.3508973530782063,
+      "learning_rate": 2.290024976084052e-05,
+      "loss": 0.5176,
+      "step": 4426
+    },
+    {
+      "epoch": 0.7870222222222222,
+      "grad_norm": 0.33838435672805656,
+      "learning_rate": 2.2863593077349944e-05,
+      "loss": 0.5472,
+      "step": 4427
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4071923342645993,
+      "learning_rate": 2.2826961968551486e-05,
+      "loss": 0.5441,
+      "step": 4428
+    },
+    {
+      "epoch": 0.7873777777777777,
+      "grad_norm": 0.35679042741907685,
+      "learning_rate": 2.2790356446590377e-05,
+      "loss": 0.5634,
+      "step": 4429
+    },
+    {
+      "epoch": 0.7875555555555556,
+      "grad_norm": 0.33146076529201013,
+      "learning_rate": 2.275377652360312e-05,
+      "loss": 0.5218,
+      "step": 4430
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.34986509909729746,
+      "learning_rate": 2.2717222211717935e-05,
+      "loss": 0.5217,
+      "step": 4431
+    },
+    {
+      "epoch": 0.7879111111111111,
+      "grad_norm": 0.3430071867379447,
+      "learning_rate": 2.2680693523054407e-05,
+      "loss": 0.549,
+      "step": 4432
+    },
+    {
+      "epoch": 0.7880888888888888,
+      "grad_norm": 0.3529690519671126,
+      "learning_rate": 2.264419046972368e-05,
+      "loss": 0.5343,
+      "step": 4433
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.33175317404133,
+      "learning_rate": 2.2607713063828394e-05,
+      "loss": 0.5063,
+      "step": 4434
+    },
+    {
+      "epoch": 0.7884444444444444,
+      "grad_norm": 0.3653405417984674,
+      "learning_rate": 2.2571261317462712e-05,
+      "loss": 0.5478,
+      "step": 4435
+    },
+    {
+      "epoch": 0.7886222222222222,
+      "grad_norm": 0.3520472676591341,
+      "learning_rate": 2.253483524271225e-05,
+      "loss": 0.6156,
+      "step": 4436
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4013144518995045,
+      "learning_rate": 2.2498434851654126e-05,
+      "loss": 0.5401,
+      "step": 4437
+    },
+    {
+      "epoch": 0.7889777777777778,
+      "grad_norm": 0.34867366665252647,
+      "learning_rate": 2.2462060156356956e-05,
+      "loss": 0.5148,
+      "step": 4438
+    },
+    {
+      "epoch": 0.7891555555555556,
+      "grad_norm": 0.3658504765354854,
+      "learning_rate": 2.2425711168880814e-05,
+      "loss": 0.5599,
+      "step": 4439
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3778239547034521,
+      "learning_rate": 2.238938790127727e-05,
+      "loss": 0.5778,
+      "step": 4440
+    },
+    {
+      "epoch": 0.7895111111111112,
+      "grad_norm": 0.34462757856381065,
+      "learning_rate": 2.2353090365589348e-05,
+      "loss": 0.5594,
+      "step": 4441
+    },
+    {
+      "epoch": 0.7896888888888889,
+      "grad_norm": 0.3513516798827274,
+      "learning_rate": 2.2316818573851563e-05,
+      "loss": 0.6061,
+      "step": 4442
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3836923635197959,
+      "learning_rate": 2.2280572538089872e-05,
+      "loss": 0.5424,
+      "step": 4443
+    },
+    {
+      "epoch": 0.7900444444444444,
+      "grad_norm": 0.36243972197616764,
+      "learning_rate": 2.224435227032171e-05,
+      "loss": 0.5948,
+      "step": 4444
+    },
+    {
+      "epoch": 0.7902222222222223,
+      "grad_norm": 0.34519664165156827,
+      "learning_rate": 2.220815778255596e-05,
+      "loss": 0.5423,
+      "step": 4445
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3537625173251878,
+      "learning_rate": 2.2171989086792956e-05,
+      "loss": 0.6044,
+      "step": 4446
+    },
+    {
+      "epoch": 0.7905777777777778,
+      "grad_norm": 0.37719169929058854,
+      "learning_rate": 2.2135846195024513e-05,
+      "loss": 0.5711,
+      "step": 4447
+    },
+    {
+      "epoch": 0.7907555555555555,
+      "grad_norm": 0.3487790925801628,
+      "learning_rate": 2.209972911923377e-05,
+      "loss": 0.5417,
+      "step": 4448
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.34635768874747586,
+      "learning_rate": 2.2063637871395527e-05,
+      "loss": 0.5523,
+      "step": 4449
+    },
+    {
+      "epoch": 0.7911111111111111,
+      "grad_norm": 0.34840796124579476,
+      "learning_rate": 2.2027572463475764e-05,
+      "loss": 0.5756,
+      "step": 4450
+    },
+    {
+      "epoch": 0.7912888888888889,
+      "grad_norm": 0.350523468511347,
+      "learning_rate": 2.1991532907432145e-05,
+      "loss": 0.5587,
+      "step": 4451
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.3508473901939546,
+      "learning_rate": 2.1955519215213527e-05,
+      "loss": 0.5143,
+      "step": 4452
+    },
+    {
+      "epoch": 0.7916444444444445,
+      "grad_norm": 0.344343383859007,
+      "learning_rate": 2.1919531398760408e-05,
+      "loss": 0.5938,
+      "step": 4453
+    },
+    {
+      "epoch": 0.7918222222222222,
+      "grad_norm": 0.3725443311322793,
+      "learning_rate": 2.1883569470004485e-05,
+      "loss": 0.5852,
+      "step": 4454
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.35888135186522996,
+      "learning_rate": 2.184763344086912e-05,
+      "loss": 0.5959,
+      "step": 4455
+    },
+    {
+      "epoch": 0.7921777777777778,
+      "grad_norm": 0.359225900311012,
+      "learning_rate": 2.1811723323268863e-05,
+      "loss": 0.6027,
+      "step": 4456
+    },
+    {
+      "epoch": 0.7923555555555556,
+      "grad_norm": 0.333395479636865,
+      "learning_rate": 2.177583912910979e-05,
+      "loss": 0.5632,
+      "step": 4457
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.3278509639443351,
+      "learning_rate": 2.173998087028938e-05,
+      "loss": 0.5462,
+      "step": 4458
+    },
+    {
+      "epoch": 0.7927111111111111,
+      "grad_norm": 0.3479318303442947,
+      "learning_rate": 2.170414855869647e-05,
+      "loss": 0.5633,
+      "step": 4459
+    },
+    {
+      "epoch": 0.7928888888888889,
+      "grad_norm": 0.35204816863943117,
+      "learning_rate": 2.1668342206211322e-05,
+      "loss": 0.5601,
+      "step": 4460
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.3520211187152833,
+      "learning_rate": 2.16325618247056e-05,
+      "loss": 0.5489,
+      "step": 4461
+    },
+    {
+      "epoch": 0.7932444444444444,
+      "grad_norm": 0.368070055178014,
+      "learning_rate": 2.159680742604234e-05,
+      "loss": 0.5655,
+      "step": 4462
+    },
+    {
+      "epoch": 0.7934222222222223,
+      "grad_norm": 0.3408177353118567,
+      "learning_rate": 2.1561079022075947e-05,
+      "loss": 0.5553,
+      "step": 4463
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.35001564237225685,
+      "learning_rate": 2.152537662465226e-05,
+      "loss": 0.5618,
+      "step": 4464
+    },
+    {
+      "epoch": 0.7937777777777778,
+      "grad_norm": 0.3670652395227683,
+      "learning_rate": 2.1489700245608437e-05,
+      "loss": 0.5728,
+      "step": 4465
+    },
+    {
+      "epoch": 0.7939555555555555,
+      "grad_norm": 0.35997823203905266,
+      "learning_rate": 2.145404989677303e-05,
+      "loss": 0.54,
+      "step": 4466
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.36123954651578766,
+      "learning_rate": 2.1418425589965996e-05,
+      "loss": 0.5568,
+      "step": 4467
+    },
+    {
+      "epoch": 0.7943111111111111,
+      "grad_norm": 0.3359671692245637,
+      "learning_rate": 2.1382827336998602e-05,
+      "loss": 0.5395,
+      "step": 4468
+    },
+    {
+      "epoch": 0.7944888888888889,
+      "grad_norm": 0.3368593471730129,
+      "learning_rate": 2.1347255149673505e-05,
+      "loss": 0.5389,
+      "step": 4469
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.3534071868639474,
+      "learning_rate": 2.1311709039784734e-05,
+      "loss": 0.5515,
+      "step": 4470
+    },
+    {
+      "epoch": 0.7948444444444445,
+      "grad_norm": 0.3748811392799846,
+      "learning_rate": 2.1276189019117677e-05,
+      "loss": 0.5618,
+      "step": 4471
+    },
+    {
+      "epoch": 0.7950222222222222,
+      "grad_norm": 0.3360369108761957,
+      "learning_rate": 2.1240695099448947e-05,
+      "loss": 0.5413,
+      "step": 4472
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3601178206035112,
+      "learning_rate": 2.1205227292546747e-05,
+      "loss": 0.5686,
+      "step": 4473
+    },
+    {
+      "epoch": 0.7953777777777777,
+      "grad_norm": 0.40071102305441203,
+      "learning_rate": 2.1169785610170356e-05,
+      "loss": 0.6048,
+      "step": 4474
+    },
+    {
+      "epoch": 0.7955555555555556,
+      "grad_norm": 0.32914596369978133,
+      "learning_rate": 2.113437006407062e-05,
+      "loss": 0.535,
+      "step": 4475
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.33948756585130024,
+      "learning_rate": 2.1098980665989532e-05,
+      "loss": 0.5461,
+      "step": 4476
+    },
+    {
+      "epoch": 0.7959111111111111,
+      "grad_norm": 1.6253968183443543,
+      "learning_rate": 2.1063617427660575e-05,
+      "loss": 0.5511,
+      "step": 4477
+    },
+    {
+      "epoch": 0.7960888888888888,
+      "grad_norm": 0.33518783280188064,
+      "learning_rate": 2.1028280360808407e-05,
+      "loss": 0.516,
+      "step": 4478
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.33894736764834865,
+      "learning_rate": 2.0992969477149183e-05,
+      "loss": 0.5607,
+      "step": 4479
+    },
+    {
+      "epoch": 0.7964444444444444,
+      "grad_norm": 0.3362547070357395,
+      "learning_rate": 2.0957684788390187e-05,
+      "loss": 0.5588,
+      "step": 4480
+    },
+    {
+      "epoch": 0.7966222222222222,
+      "grad_norm": 0.36097925977397793,
+      "learning_rate": 2.092242630623016e-05,
+      "loss": 0.6071,
+      "step": 4481
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3672843359144088,
+      "learning_rate": 2.0887194042359083e-05,
+      "loss": 0.5533,
+      "step": 4482
+    },
+    {
+      "epoch": 0.7969777777777778,
+      "grad_norm": 0.36099009906248897,
+      "learning_rate": 2.0851988008458278e-05,
+      "loss": 0.5738,
+      "step": 4483
+    },
+    {
+      "epoch": 0.7971555555555555,
+      "grad_norm": 0.33563502326317596,
+      "learning_rate": 2.0816808216200358e-05,
+      "loss": 0.5555,
+      "step": 4484
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.3819935557582957,
+      "learning_rate": 2.078165467724924e-05,
+      "loss": 0.602,
+      "step": 4485
+    },
+    {
+      "epoch": 0.7975111111111111,
+      "grad_norm": 0.3716518783836378,
+      "learning_rate": 2.074652740326013e-05,
+      "loss": 0.5997,
+      "step": 4486
+    },
+    {
+      "epoch": 0.7976888888888889,
+      "grad_norm": 0.359379858939067,
+      "learning_rate": 2.071142640587952e-05,
+      "loss": 0.5673,
+      "step": 4487
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.3369182512690324,
+      "learning_rate": 2.06763516967452e-05,
+      "loss": 0.6033,
+      "step": 4488
+    },
+    {
+      "epoch": 0.7980444444444444,
+      "grad_norm": 0.3737307299317438,
+      "learning_rate": 2.064130328748626e-05,
+      "loss": 0.5042,
+      "step": 4489
+    },
+    {
+      "epoch": 0.7982222222222223,
+      "grad_norm": 0.3531128111052518,
+      "learning_rate": 2.060628118972303e-05,
+      "loss": 0.5788,
+      "step": 4490
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3443491998389885,
+      "learning_rate": 2.0571285415067164e-05,
+      "loss": 0.5528,
+      "step": 4491
+    },
+    {
+      "epoch": 0.7985777777777778,
+      "grad_norm": 0.3538940388148492,
+      "learning_rate": 2.0536315975121544e-05,
+      "loss": 0.582,
+      "step": 4492
+    },
+    {
+      "epoch": 0.7987555555555556,
+      "grad_norm": 0.3615864338705043,
+      "learning_rate": 2.050137288148035e-05,
+      "loss": 0.5618,
+      "step": 4493
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.37572026154617716,
+      "learning_rate": 2.0466456145729007e-05,
+      "loss": 0.566,
+      "step": 4494
+    },
+    {
+      "epoch": 0.7991111111111111,
+      "grad_norm": 0.35576437549087414,
+      "learning_rate": 2.043156577944425e-05,
+      "loss": 0.5694,
+      "step": 4495
+    },
+    {
+      "epoch": 0.7992888888888889,
+      "grad_norm": 0.38746520990094374,
+      "learning_rate": 2.039670179419395e-05,
+      "loss": 0.6312,
+      "step": 4496
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.37006287789317643,
+      "learning_rate": 2.036186420153743e-05,
+      "loss": 0.5911,
+      "step": 4497
+    },
+    {
+      "epoch": 0.7996444444444445,
+      "grad_norm": 0.35691504053422074,
+      "learning_rate": 2.032705301302501e-05,
+      "loss": 0.5755,
+      "step": 4498
+    },
+    {
+      "epoch": 0.7998222222222222,
+      "grad_norm": 0.42304185796233584,
+      "learning_rate": 2.029226824019853e-05,
+      "loss": 0.5525,
+      "step": 4499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.33026419052579314,
+      "learning_rate": 2.025750989459081e-05,
+      "loss": 0.5484,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8001777777777778,
+      "grad_norm": 0.3461808966618599,
+      "learning_rate": 2.022277798772614e-05,
+      "loss": 0.5496,
+      "step": 4501
+    },
+    {
+      "epoch": 0.8003555555555556,
+      "grad_norm": 0.4358772247852598,
+      "learning_rate": 2.018807253111984e-05,
+      "loss": 0.5523,
+      "step": 4502
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.35519831242472266,
+      "learning_rate": 2.0153393536278653e-05,
+      "loss": 0.5928,
+      "step": 4503
+    },
+    {
+      "epoch": 0.8007111111111112,
+      "grad_norm": 0.333958091319544,
+      "learning_rate": 2.0118741014700372e-05,
+      "loss": 0.5458,
+      "step": 4504
+    },
+    {
+      "epoch": 0.8008888888888889,
+      "grad_norm": 0.740108692807291,
+      "learning_rate": 2.0084114977874135e-05,
+      "loss": 0.5655,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.3508957376837852,
+      "learning_rate": 2.004951543728023e-05,
+      "loss": 0.5666,
+      "step": 4506
+    },
+    {
+      "epoch": 0.8012444444444444,
+      "grad_norm": 0.3360677832305794,
+      "learning_rate": 2.0014942404390214e-05,
+      "loss": 0.5376,
+      "step": 4507
+    },
+    {
+      "epoch": 0.8014222222222223,
+      "grad_norm": 0.3467867005964346,
+      "learning_rate": 1.99803958906668e-05,
+      "loss": 0.528,
+      "step": 4508
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3712939416264039,
+      "learning_rate": 1.994587590756397e-05,
+      "loss": 0.6204,
+      "step": 4509
+    },
+    {
+      "epoch": 0.8017777777777778,
+      "grad_norm": 0.35065132253225106,
+      "learning_rate": 1.991138246652685e-05,
+      "loss": 0.5979,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8019555555555555,
+      "grad_norm": 0.3396135587110741,
+      "learning_rate": 1.9876915578991808e-05,
+      "loss": 0.5266,
+      "step": 4511
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.3568987090779943,
+      "learning_rate": 1.9842475256386384e-05,
+      "loss": 0.5692,
+      "step": 4512
+    },
+    {
+      "epoch": 0.8023111111111111,
+      "grad_norm": 0.356532759147842,
+      "learning_rate": 1.9808061510129317e-05,
+      "loss": 0.5551,
+      "step": 4513
+    },
+    {
+      "epoch": 0.8024888888888889,
+      "grad_norm": 0.4643125194046535,
+      "learning_rate": 1.9773674351630545e-05,
+      "loss": 0.5584,
+      "step": 4514
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.35714688138901224,
+      "learning_rate": 1.973931379229118e-05,
+      "loss": 0.5629,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8028444444444445,
+      "grad_norm": 0.4130592376490037,
+      "learning_rate": 1.970497984350351e-05,
+      "loss": 0.5994,
+      "step": 4516
+    },
+    {
+      "epoch": 0.8030222222222222,
+      "grad_norm": 0.34797770338119544,
+      "learning_rate": 1.967067251665101e-05,
+      "loss": 0.5304,
+      "step": 4517
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.33840676355723787,
+      "learning_rate": 1.9636391823108335e-05,
+      "loss": 0.5841,
+      "step": 4518
+    },
+    {
+      "epoch": 0.8033777777777777,
+      "grad_norm": 0.32757421432600875,
+      "learning_rate": 1.9602137774241326e-05,
+      "loss": 0.5391,
+      "step": 4519
+    },
+    {
+      "epoch": 0.8035555555555556,
+      "grad_norm": 0.39733954575909924,
+      "learning_rate": 1.9567910381406875e-05,
+      "loss": 0.5517,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.34193620603567504,
+      "learning_rate": 1.9533709655953235e-05,
+      "loss": 0.5303,
+      "step": 4521
+    },
+    {
+      "epoch": 0.8039111111111111,
+      "grad_norm": 0.3432225042695941,
+      "learning_rate": 1.94995356092196e-05,
+      "loss": 0.5446,
+      "step": 4522
+    },
+    {
+      "epoch": 0.8040888888888889,
+      "grad_norm": 0.3872453157838041,
+      "learning_rate": 1.9465388252536543e-05,
+      "loss": 0.5415,
+      "step": 4523
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.508534416595769,
+      "learning_rate": 1.9431267597225568e-05,
+      "loss": 0.5429,
+      "step": 4524
+    },
+    {
+      "epoch": 0.8044444444444444,
+      "grad_norm": 0.35620050998119435,
+      "learning_rate": 1.939717365459952e-05,
+      "loss": 0.5617,
+      "step": 4525
+    },
+    {
+      "epoch": 0.8046222222222222,
+      "grad_norm": 0.34453399955177694,
+      "learning_rate": 1.9363106435962197e-05,
+      "loss": 0.6012,
+      "step": 4526
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3720643563949712,
+      "learning_rate": 1.932906595260874e-05,
+      "loss": 0.5632,
+      "step": 4527
+    },
+    {
+      "epoch": 0.8049777777777778,
+      "grad_norm": 0.35443001911900646,
+      "learning_rate": 1.9295052215825228e-05,
+      "loss": 0.5429,
+      "step": 4528
+    },
+    {
+      "epoch": 0.8051555555555555,
+      "grad_norm": 0.34949477323600175,
+      "learning_rate": 1.9261065236889066e-05,
+      "loss": 0.5337,
+      "step": 4529
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.36482230095932267,
+      "learning_rate": 1.9227105027068603e-05,
+      "loss": 0.5483,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8055111111111111,
+      "grad_norm": 0.3606895951567287,
+      "learning_rate": 1.9193171597623437e-05,
+      "loss": 0.5604,
+      "step": 4531
+    },
+    {
+      "epoch": 0.8056888888888889,
+      "grad_norm": 0.3530758010834439,
+      "learning_rate": 1.9159264959804247e-05,
+      "loss": 0.5352,
+      "step": 4532
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.34552307378285196,
+      "learning_rate": 1.9125385124852813e-05,
+      "loss": 0.5537,
+      "step": 4533
+    },
+    {
+      "epoch": 0.8060444444444445,
+      "grad_norm": 0.3457387904227504,
+      "learning_rate": 1.9091532104002052e-05,
+      "loss": 0.5378,
+      "step": 4534
+    },
+    {
+      "epoch": 0.8062222222222222,
+      "grad_norm": 0.3760289220757086,
+      "learning_rate": 1.9057705908475998e-05,
+      "loss": 0.6071,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3481072292087357,
+      "learning_rate": 1.9023906549489767e-05,
+      "loss": 0.5649,
+      "step": 4536
+    },
+    {
+      "epoch": 0.8065777777777777,
+      "grad_norm": 0.36904692154365754,
+      "learning_rate": 1.8990134038249585e-05,
+      "loss": 0.5675,
+      "step": 4537
+    },
+    {
+      "epoch": 0.8067555555555556,
+      "grad_norm": 0.34327763958380925,
+      "learning_rate": 1.8956388385952772e-05,
+      "loss": 0.5583,
+      "step": 4538
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.38008301820958273,
+      "learning_rate": 1.8922669603787778e-05,
+      "loss": 0.5433,
+      "step": 4539
+    },
+    {
+      "epoch": 0.8071111111111111,
+      "grad_norm": 0.35588886442620565,
+      "learning_rate": 1.8888977702934085e-05,
+      "loss": 0.5648,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8072888888888888,
+      "grad_norm": 0.3612943294548691,
+      "learning_rate": 1.885531269456231e-05,
+      "loss": 0.5634,
+      "step": 4541
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.3463952388177529,
+      "learning_rate": 1.8821674589834136e-05,
+      "loss": 0.5301,
+      "step": 4542
+    },
+    {
+      "epoch": 0.8076444444444445,
+      "grad_norm": 0.3436568356015561,
+      "learning_rate": 1.8788063399902333e-05,
+      "loss": 0.5811,
+      "step": 4543
+    },
+    {
+      "epoch": 0.8078222222222222,
+      "grad_norm": 0.35952420836401094,
+      "learning_rate": 1.875447913591073e-05,
+      "loss": 0.6169,
+      "step": 4544
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3513013344849414,
+      "learning_rate": 1.8720921808994263e-05,
+      "loss": 0.5715,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8081777777777778,
+      "grad_norm": 0.3301564649722469,
+      "learning_rate": 1.8687391430278845e-05,
+      "loss": 0.5266,
+      "step": 4546
+    },
+    {
+      "epoch": 0.8083555555555556,
+      "grad_norm": 0.3782404872532981,
+      "learning_rate": 1.8653888010881637e-05,
+      "loss": 0.5864,
+      "step": 4547
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.3601340175097413,
+      "learning_rate": 1.862041156191062e-05,
+      "loss": 0.5488,
+      "step": 4548
+    },
+    {
+      "epoch": 0.8087111111111112,
+      "grad_norm": 0.34018039796512534,
+      "learning_rate": 1.8586962094465098e-05,
+      "loss": 0.5577,
+      "step": 4549
+    },
+    {
+      "epoch": 0.8088888888888889,
+      "grad_norm": 0.3499380589292849,
+      "learning_rate": 1.8553539619635153e-05,
+      "loss": 0.599,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.3757626733167595,
+      "learning_rate": 1.852014414850218e-05,
+      "loss": 0.5762,
+      "step": 4551
+    },
+    {
+      "epoch": 0.8092444444444444,
+      "grad_norm": 0.349663344518537,
+      "learning_rate": 1.8486775692138403e-05,
+      "loss": 0.5639,
+      "step": 4552
+    },
+    {
+      "epoch": 0.8094222222222223,
+      "grad_norm": 0.359312478214904,
+      "learning_rate": 1.8453434261607273e-05,
+      "loss": 0.5801,
+      "step": 4553
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3755293871556833,
+      "learning_rate": 1.8420119867963116e-05,
+      "loss": 0.5651,
+      "step": 4554
+    },
+    {
+      "epoch": 0.8097777777777778,
+      "grad_norm": 0.3546961413125564,
+      "learning_rate": 1.8386832522251397e-05,
+      "loss": 0.5486,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8099555555555555,
+      "grad_norm": 0.36844581318373004,
+      "learning_rate": 1.8353572235508576e-05,
+      "loss": 0.5301,
+      "step": 4556
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.3474790977287327,
+      "learning_rate": 1.8320339018762167e-05,
+      "loss": 0.524,
+      "step": 4557
+    },
+    {
+      "epoch": 0.8103111111111111,
+      "grad_norm": 0.3495217777342104,
+      "learning_rate": 1.82871328830307e-05,
+      "loss": 0.5156,
+      "step": 4558
+    },
+    {
+      "epoch": 0.8104888888888889,
+      "grad_norm": 0.3431425975340414,
+      "learning_rate": 1.825395383932369e-05,
+      "loss": 0.5431,
+      "step": 4559
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.3639500557288192,
+      "learning_rate": 1.8220801898641726e-05,
+      "loss": 0.564,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8108444444444445,
+      "grad_norm": 0.34495475604132514,
+      "learning_rate": 1.818767707197636e-05,
+      "loss": 0.5284,
+      "step": 4561
+    },
+    {
+      "epoch": 0.8110222222222222,
+      "grad_norm": 0.38018742762442376,
+      "learning_rate": 1.815457937031021e-05,
+      "loss": 0.5618,
+      "step": 4562
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.36094902973346266,
+      "learning_rate": 1.812150880461684e-05,
+      "loss": 0.5685,
+      "step": 4563
+    },
+    {
+      "epoch": 0.8113777777777778,
+      "grad_norm": 0.3945584192507859,
+      "learning_rate": 1.8088465385860854e-05,
+      "loss": 0.5818,
+      "step": 4564
+    },
+    {
+      "epoch": 0.8115555555555556,
+      "grad_norm": 0.35973079911753336,
+      "learning_rate": 1.805544912499786e-05,
+      "loss": 0.5541,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.3479356219760524,
+      "learning_rate": 1.802246003297443e-05,
+      "loss": 0.5107,
+      "step": 4566
+    },
+    {
+      "epoch": 0.8119111111111111,
+      "grad_norm": 0.3470924510788464,
+      "learning_rate": 1.7989498120728164e-05,
+      "loss": 0.5638,
+      "step": 4567
+    },
+    {
+      "epoch": 0.8120888888888889,
+      "grad_norm": 0.34803119178180775,
+      "learning_rate": 1.795656339918762e-05,
+      "loss": 0.5609,
+      "step": 4568
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.3407490409641714,
+      "learning_rate": 1.7923655879272393e-05,
+      "loss": 0.5312,
+      "step": 4569
+    },
+    {
+      "epoch": 0.8124444444444444,
+      "grad_norm": 0.33705007966929834,
+      "learning_rate": 1.7890775571892936e-05,
+      "loss": 0.5946,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8126222222222222,
+      "grad_norm": 0.33340039400453747,
+      "learning_rate": 1.7857922487950874e-05,
+      "loss": 0.5092,
+      "step": 4571
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.36565609451371067,
+      "learning_rate": 1.782509663833858e-05,
+      "loss": 0.5415,
+      "step": 4572
+    },
+    {
+      "epoch": 0.8129777777777778,
+      "grad_norm": 0.3670837480893755,
+      "learning_rate": 1.7792298033939625e-05,
+      "loss": 0.5683,
+      "step": 4573
+    },
+    {
+      "epoch": 0.8131555555555555,
+      "grad_norm": 0.37623611333394774,
+      "learning_rate": 1.7759526685628335e-05,
+      "loss": 0.5958,
+      "step": 4574
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.34590026979819816,
+      "learning_rate": 1.772678260427021e-05,
+      "loss": 0.5395,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8135111111111111,
+      "grad_norm": 0.3586870247485163,
+      "learning_rate": 1.7694065800721483e-05,
+      "loss": 0.5395,
+      "step": 4576
+    },
+    {
+      "epoch": 0.8136888888888889,
+      "grad_norm": 0.37581326706659135,
+      "learning_rate": 1.7661376285829568e-05,
+      "loss": 0.5226,
+      "step": 4577
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.37136573441191323,
+      "learning_rate": 1.762871407043264e-05,
+      "loss": 0.5617,
+      "step": 4578
+    },
+    {
+      "epoch": 0.8140444444444445,
+      "grad_norm": 0.374560526700098,
+      "learning_rate": 1.7596079165359935e-05,
+      "loss": 0.5139,
+      "step": 4579
+    },
+    {
+      "epoch": 0.8142222222222222,
+      "grad_norm": 0.4127415183735608,
+      "learning_rate": 1.7563471581431624e-05,
+      "loss": 0.6367,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3746678968592116,
+      "learning_rate": 1.7530891329458764e-05,
+      "loss": 0.5527,
+      "step": 4581
+    },
+    {
+      "epoch": 0.8145777777777777,
+      "grad_norm": 0.3688458977138624,
+      "learning_rate": 1.7498338420243422e-05,
+      "loss": 0.6242,
+      "step": 4582
+    },
+    {
+      "epoch": 0.8147555555555556,
+      "grad_norm": 0.3508663867382437,
+      "learning_rate": 1.7465812864578534e-05,
+      "loss": 0.5451,
+      "step": 4583
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.3665796277621071,
+      "learning_rate": 1.7433314673248024e-05,
+      "loss": 0.5438,
+      "step": 4584
+    },
+    {
+      "epoch": 0.8151111111111111,
+      "grad_norm": 0.340318077502143,
+      "learning_rate": 1.7400843857026705e-05,
+      "loss": 0.572,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8152888888888888,
+      "grad_norm": 0.37114039250681113,
+      "learning_rate": 1.736840042668032e-05,
+      "loss": 0.5812,
+      "step": 4586
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.3707407810397685,
+      "learning_rate": 1.7335984392965545e-05,
+      "loss": 0.5779,
+      "step": 4587
+    },
+    {
+      "epoch": 0.8156444444444444,
+      "grad_norm": 0.5453313783553445,
+      "learning_rate": 1.7303595766629955e-05,
+      "loss": 0.5474,
+      "step": 4588
+    },
+    {
+      "epoch": 0.8158222222222222,
+      "grad_norm": 0.3656498505838504,
+      "learning_rate": 1.7271234558412052e-05,
+      "loss": 0.6156,
+      "step": 4589
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3539140344372221,
+      "learning_rate": 1.7238900779041255e-05,
+      "loss": 0.5713,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8161777777777778,
+      "grad_norm": 0.395913449562202,
+      "learning_rate": 1.7206594439237865e-05,
+      "loss": 0.5406,
+      "step": 4591
+    },
+    {
+      "epoch": 0.8163555555555555,
+      "grad_norm": 0.3410503372642445,
+      "learning_rate": 1.7174315549713104e-05,
+      "loss": 0.5618,
+      "step": 4592
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3471925479117926,
+      "learning_rate": 1.714206412116911e-05,
+      "loss": 0.593,
+      "step": 4593
+    },
+    {
+      "epoch": 0.8167111111111112,
+      "grad_norm": 0.3226915829367458,
+      "learning_rate": 1.7109840164298807e-05,
+      "loss": 0.5123,
+      "step": 4594
+    },
+    {
+      "epoch": 0.8168888888888889,
+      "grad_norm": 0.3676559287810124,
+      "learning_rate": 1.7077643689786215e-05,
+      "loss": 0.596,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.36599664024044093,
+      "learning_rate": 1.704547470830601e-05,
+      "loss": 0.5846,
+      "step": 4596
+    },
+    {
+      "epoch": 0.8172444444444444,
+      "grad_norm": 0.3776913498622677,
+      "learning_rate": 1.7013333230523976e-05,
+      "loss": 0.5545,
+      "step": 4597
+    },
+    {
+      "epoch": 0.8174222222222223,
+      "grad_norm": 0.35568090088044013,
+      "learning_rate": 1.698121926709656e-05,
+      "loss": 0.5608,
+      "step": 4598
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3556319602517602,
+      "learning_rate": 1.69491328286713e-05,
+      "loss": 0.5871,
+      "step": 4599
+    },
+    {
+      "epoch": 0.8177777777777778,
+      "grad_norm": 0.3726760571642367,
+      "learning_rate": 1.6917073925886406e-05,
+      "loss": 0.5893,
+      "step": 4600
+    },
+    {
+      "epoch": 0.8179555555555555,
+      "grad_norm": 0.36508890664754523,
+      "learning_rate": 1.6885042569371146e-05,
+      "loss": 0.5783,
+      "step": 4601
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.3568767846657704,
+      "learning_rate": 1.6853038769745467e-05,
+      "loss": 0.5191,
+      "step": 4602
+    },
+    {
+      "epoch": 0.8183111111111111,
+      "grad_norm": 0.37184830502349037,
+      "learning_rate": 1.6821062537620356e-05,
+      "loss": 0.559,
+      "step": 4603
+    },
+    {
+      "epoch": 0.8184888888888889,
+      "grad_norm": 0.33484449060124793,
+      "learning_rate": 1.6789113883597595e-05,
+      "loss": 0.5413,
+      "step": 4604
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.39855687864076433,
+      "learning_rate": 1.6757192818269708e-05,
+      "loss": 0.586,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8188444444444445,
+      "grad_norm": 0.37639534985133716,
+      "learning_rate": 1.6725299352220282e-05,
+      "loss": 0.5533,
+      "step": 4606
+    },
+    {
+      "epoch": 0.8190222222222222,
+      "grad_norm": 0.33750942372096643,
+      "learning_rate": 1.6693433496023546e-05,
+      "loss": 0.5423,
+      "step": 4607
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3447269393525857,
+      "learning_rate": 1.6661595260244767e-05,
+      "loss": 0.5476,
+      "step": 4608
+    },
+    {
+      "epoch": 0.8193777777777778,
+      "grad_norm": 0.35257298750261873,
+      "learning_rate": 1.6629784655439872e-05,
+      "loss": 0.5369,
+      "step": 4609
+    },
+    {
+      "epoch": 0.8195555555555556,
+      "grad_norm": 0.3715824000515777,
+      "learning_rate": 1.6598001692155807e-05,
+      "loss": 0.5388,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.3529065893526655,
+      "learning_rate": 1.656624638093016e-05,
+      "loss": 0.5671,
+      "step": 4611
+    },
+    {
+      "epoch": 0.8199111111111111,
+      "grad_norm": 0.3531961411263078,
+      "learning_rate": 1.653451873229156e-05,
+      "loss": 0.5724,
+      "step": 4612
+    },
+    {
+      "epoch": 0.8200888888888889,
+      "grad_norm": 0.35740476891326806,
+      "learning_rate": 1.6502818756759276e-05,
+      "loss": 0.5906,
+      "step": 4613
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.3598842387726135,
+      "learning_rate": 1.64711464648435e-05,
+      "loss": 0.5572,
+      "step": 4614
+    },
+    {
+      "epoch": 0.8204444444444444,
+      "grad_norm": 0.3616519161404355,
+      "learning_rate": 1.6439501867045236e-05,
+      "loss": 0.5886,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8206222222222223,
+      "grad_norm": 0.33252267792247087,
+      "learning_rate": 1.6407884973856313e-05,
+      "loss": 0.5437,
+      "step": 4616
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.36718085592714184,
+      "learning_rate": 1.6376295795759333e-05,
+      "loss": 0.5955,
+      "step": 4617
+    },
+    {
+      "epoch": 0.8209777777777778,
+      "grad_norm": 0.3768386185088991,
+      "learning_rate": 1.634473434322775e-05,
+      "loss": 0.5769,
+      "step": 4618
+    },
+    {
+      "epoch": 0.8211555555555555,
+      "grad_norm": 0.6255451876089728,
+      "learning_rate": 1.6313200626725812e-05,
+      "loss": 0.5602,
+      "step": 4619
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.3602775393772923,
+      "learning_rate": 1.6281694656708568e-05,
+      "loss": 0.5261,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8215111111111111,
+      "grad_norm": 0.35791091421792165,
+      "learning_rate": 1.6250216443621867e-05,
+      "loss": 0.5811,
+      "step": 4621
+    },
+    {
+      "epoch": 0.8216888888888889,
+      "grad_norm": 0.32343170089489137,
+      "learning_rate": 1.6218765997902362e-05,
+      "loss": 0.5004,
+      "step": 4622
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.35279187509927945,
+      "learning_rate": 1.61873433299775e-05,
+      "loss": 0.5708,
+      "step": 4623
+    },
+    {
+      "epoch": 0.8220444444444445,
+      "grad_norm": 0.3538295935007023,
+      "learning_rate": 1.61559484502655e-05,
+      "loss": 0.5079,
+      "step": 4624
+    },
+    {
+      "epoch": 0.8222222222222222,
+      "grad_norm": 0.3606148985539107,
+      "learning_rate": 1.6124581369175396e-05,
+      "loss": 0.5821,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.35663529012171563,
+      "learning_rate": 1.6093242097106986e-05,
+      "loss": 0.5112,
+      "step": 4626
+    },
+    {
+      "epoch": 0.8225777777777777,
+      "grad_norm": 0.3423582697935085,
+      "learning_rate": 1.606193064445085e-05,
+      "loss": 0.5808,
+      "step": 4627
+    },
+    {
+      "epoch": 0.8227555555555556,
+      "grad_norm": 0.35298797109406516,
+      "learning_rate": 1.6030647021588373e-05,
+      "loss": 0.5641,
+      "step": 4628
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.33778769108216783,
+      "learning_rate": 1.5999391238891616e-05,
+      "loss": 0.5288,
+      "step": 4629
+    },
+    {
+      "epoch": 0.8231111111111111,
+      "grad_norm": 0.3715260785403377,
+      "learning_rate": 1.5968163306723572e-05,
+      "loss": 0.5933,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8232888888888888,
+      "grad_norm": 0.35288695134186454,
+      "learning_rate": 1.593696323543783e-05,
+      "loss": 0.5968,
+      "step": 4631
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.33637348736462946,
+      "learning_rate": 1.590579103537889e-05,
+      "loss": 0.5678,
+      "step": 4632
+    },
+    {
+      "epoch": 0.8236444444444444,
+      "grad_norm": 0.3543964142635851,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.579,
+      "step": 4633
+    },
+    {
+      "epoch": 0.8238222222222222,
+      "grad_norm": 0.3533556402221236,
+      "learning_rate": 1.58435302902728e-05,
+      "loss": 0.5574,
+      "step": 4634
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3824685468415375,
+      "learning_rate": 1.5812441765868292e-05,
+      "loss": 0.6465,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8241777777777778,
+      "grad_norm": 0.35155479081329283,
+      "learning_rate": 1.578138115397587e-05,
+      "loss": 0.5558,
+      "step": 4636
+    },
+    {
+      "epoch": 0.8243555555555555,
+      "grad_norm": 0.3403565334633738,
+      "learning_rate": 1.5750348464893683e-05,
+      "loss": 0.5111,
+      "step": 4637
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.35526059669149385,
+      "learning_rate": 1.571934370891066e-05,
+      "loss": 0.5524,
+      "step": 4638
+    },
+    {
+      "epoch": 0.8247111111111111,
+      "grad_norm": 0.34540197164354736,
+      "learning_rate": 1.5688366896306494e-05,
+      "loss": 0.5879,
+      "step": 4639
+    },
+    {
+      "epoch": 0.8248888888888889,
+      "grad_norm": 0.34040304986408365,
+      "learning_rate": 1.565741803735159e-05,
+      "loss": 0.546,
+      "step": 4640
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.39170139567938245,
+      "learning_rate": 1.5626497142307084e-05,
+      "loss": 0.5821,
+      "step": 4641
+    },
+    {
+      "epoch": 0.8252444444444444,
+      "grad_norm": 0.338488145541669,
+      "learning_rate": 1.5595604221424852e-05,
+      "loss": 0.5558,
+      "step": 4642
+    },
+    {
+      "epoch": 0.8254222222222222,
+      "grad_norm": 0.3704198341446206,
+      "learning_rate": 1.5564739284947484e-05,
+      "loss": 0.5431,
+      "step": 4643
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.33742454014359957,
+      "learning_rate": 1.5533902343108286e-05,
+      "loss": 0.5433,
+      "step": 4644
+    },
+    {
+      "epoch": 0.8257777777777778,
+      "grad_norm": 0.35108012187141485,
+      "learning_rate": 1.550309340613132e-05,
+      "loss": 0.5702,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8259555555555556,
+      "grad_norm": 0.36221545891823625,
+      "learning_rate": 1.547231248423132e-05,
+      "loss": 0.5193,
+      "step": 4646
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.4450807898838353,
+      "learning_rate": 1.544155958761374e-05,
+      "loss": 0.5527,
+      "step": 4647
+    },
+    {
+      "epoch": 0.8263111111111111,
+      "grad_norm": 0.3711891715977658,
+      "learning_rate": 1.5410834726474756e-05,
+      "loss": 0.5343,
+      "step": 4648
+    },
+    {
+      "epoch": 0.8264888888888889,
+      "grad_norm": 0.4097167895986613,
+      "learning_rate": 1.5380137911001248e-05,
+      "loss": 0.5802,
+      "step": 4649
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.36882713887124335,
+      "learning_rate": 1.5349469151370776e-05,
+      "loss": 0.588,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8268444444444445,
+      "grad_norm": 0.35511304058788473,
+      "learning_rate": 1.5318828457751634e-05,
+      "loss": 0.5543,
+      "step": 4651
+    },
+    {
+      "epoch": 0.8270222222222222,
+      "grad_norm": 0.3357813184958634,
+      "learning_rate": 1.52882158403028e-05,
+      "loss": 0.5075,
+      "step": 4652
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.33540687207754266,
+      "learning_rate": 1.525763130917387e-05,
+      "loss": 0.5171,
+      "step": 4653
+    },
+    {
+      "epoch": 0.8273777777777778,
+      "grad_norm": 0.35509188796866664,
+      "learning_rate": 1.5227074874505276e-05,
+      "loss": 0.5958,
+      "step": 4654
+    },
+    {
+      "epoch": 0.8275555555555556,
+      "grad_norm": 0.36599668887657677,
+      "learning_rate": 1.519654654642796e-05,
+      "loss": 0.5607,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.3680427195730148,
+      "learning_rate": 1.5166046335063733e-05,
+      "loss": 0.5643,
+      "step": 4656
+    },
+    {
+      "epoch": 0.8279111111111112,
+      "grad_norm": 0.371580746616385,
+      "learning_rate": 1.5135574250524897e-05,
+      "loss": 0.5486,
+      "step": 4657
+    },
+    {
+      "epoch": 0.8280888888888889,
+      "grad_norm": 0.36303029268511805,
+      "learning_rate": 1.5105130302914594e-05,
+      "loss": 0.536,
+      "step": 4658
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.3422375504765315,
+      "learning_rate": 1.5074714502326492e-05,
+      "loss": 0.5249,
+      "step": 4659
+    },
+    {
+      "epoch": 0.8284444444444444,
+      "grad_norm": 0.35983951449013896,
+      "learning_rate": 1.504432685884506e-05,
+      "loss": 0.5779,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8286222222222223,
+      "grad_norm": 0.47904230310632456,
+      "learning_rate": 1.5013967382545324e-05,
+      "loss": 0.5386,
+      "step": 4661
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3622219941103378,
+      "learning_rate": 1.4983636083493014e-05,
+      "loss": 0.6015,
+      "step": 4662
+    },
+    {
+      "epoch": 0.8289777777777778,
+      "grad_norm": 0.32784605092850366,
+      "learning_rate": 1.4953332971744538e-05,
+      "loss": 0.5337,
+      "step": 4663
+    },
+    {
+      "epoch": 0.8291555555555555,
+      "grad_norm": 0.36675632596644536,
+      "learning_rate": 1.4923058057346929e-05,
+      "loss": 0.6006,
+      "step": 4664
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.34932092477537297,
+      "learning_rate": 1.4892811350337876e-05,
+      "loss": 0.5006,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8295111111111111,
+      "grad_norm": 0.3709937673398102,
+      "learning_rate": 1.4862592860745728e-05,
+      "loss": 0.5573,
+      "step": 4666
+    },
+    {
+      "epoch": 0.8296888888888889,
+      "grad_norm": 0.3465718005692071,
+      "learning_rate": 1.4832402598589479e-05,
+      "loss": 0.5674,
+      "step": 4667
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.3766189999754378,
+      "learning_rate": 1.4802240573878733e-05,
+      "loss": 0.5824,
+      "step": 4668
+    },
+    {
+      "epoch": 0.8300444444444445,
+      "grad_norm": 0.3345696668177558,
+      "learning_rate": 1.4772106796613772e-05,
+      "loss": 0.4999,
+      "step": 4669
+    },
+    {
+      "epoch": 0.8302222222222222,
+      "grad_norm": 0.3530872814444731,
+      "learning_rate": 1.4742001276785488e-05,
+      "loss": 0.5865,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3446780163398616,
+      "learning_rate": 1.4711924024375422e-05,
+      "loss": 0.5636,
+      "step": 4671
+    },
+    {
+      "epoch": 0.8305777777777777,
+      "grad_norm": 0.3295083179724757,
+      "learning_rate": 1.468187504935572e-05,
+      "loss": 0.5664,
+      "step": 4672
+    },
+    {
+      "epoch": 0.8307555555555556,
+      "grad_norm": 0.3577560216218309,
+      "learning_rate": 1.4651854361689178e-05,
+      "loss": 0.5677,
+      "step": 4673
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.33439144923605224,
+      "learning_rate": 1.4621861971329187e-05,
+      "loss": 0.5893,
+      "step": 4674
+    },
+    {
+      "epoch": 0.8311111111111111,
+      "grad_norm": 0.35681904113401597,
+      "learning_rate": 1.4591897888219764e-05,
+      "loss": 0.5512,
+      "step": 4675
+    },
+    {
+      "epoch": 0.8312888888888889,
+      "grad_norm": 0.3357108046587461,
+      "learning_rate": 1.4561962122295591e-05,
+      "loss": 0.5385,
+      "step": 4676
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.35755230277570965,
+      "learning_rate": 1.4532054683481832e-05,
+      "loss": 0.559,
+      "step": 4677
+    },
+    {
+      "epoch": 0.8316444444444444,
+      "grad_norm": 0.3579772552384265,
+      "learning_rate": 1.4502175581694443e-05,
+      "loss": 0.5535,
+      "step": 4678
+    },
+    {
+      "epoch": 0.8318222222222222,
+      "grad_norm": 0.34125427623228594,
+      "learning_rate": 1.447232482683979e-05,
+      "loss": 0.5493,
+      "step": 4679
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3413170267507868,
+      "learning_rate": 1.444250242881503e-05,
+      "loss": 0.5737,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8321777777777778,
+      "grad_norm": 0.5051103282140549,
+      "learning_rate": 1.4412708397507724e-05,
+      "loss": 0.5698,
+      "step": 4681
+    },
+    {
+      "epoch": 0.8323555555555555,
+      "grad_norm": 0.3626556516412714,
+      "learning_rate": 1.4382942742796223e-05,
+      "loss": 0.5961,
+      "step": 4682
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.3549541391723931,
+      "learning_rate": 1.4353205474549291e-05,
+      "loss": 0.56,
+      "step": 4683
+    },
+    {
+      "epoch": 0.8327111111111111,
+      "grad_norm": 0.38606667689986,
+      "learning_rate": 1.4323496602626452e-05,
+      "loss": 0.5902,
+      "step": 4684
+    },
+    {
+      "epoch": 0.8328888888888889,
+      "grad_norm": 0.3426806560213806,
+      "learning_rate": 1.4293816136877637e-05,
+      "loss": 0.5625,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.38067762170165054,
+      "learning_rate": 1.4264164087143539e-05,
+      "loss": 0.5785,
+      "step": 4686
+    },
+    {
+      "epoch": 0.8332444444444445,
+      "grad_norm": 0.3456259157854561,
+      "learning_rate": 1.4234540463255263e-05,
+      "loss": 0.5616,
+      "step": 4687
+    },
+    {
+      "epoch": 0.8334222222222222,
+      "grad_norm": 0.3416700683997748,
+      "learning_rate": 1.4204945275034598e-05,
+      "loss": 0.5257,
+      "step": 4688
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.39219776182546007,
+      "learning_rate": 1.417537853229387e-05,
+      "loss": 0.5746,
+      "step": 4689
+    },
+    {
+      "epoch": 0.8337777777777777,
+      "grad_norm": 0.35035732783692075,
+      "learning_rate": 1.4145840244835983e-05,
+      "loss": 0.5536,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8339555555555556,
+      "grad_norm": 0.8101289168537157,
+      "learning_rate": 1.4116330422454394e-05,
+      "loss": 0.5424,
+      "step": 4691
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.3731380113997993,
+      "learning_rate": 1.408684907493314e-05,
+      "loss": 0.5434,
+      "step": 4692
+    },
+    {
+      "epoch": 0.8343111111111111,
+      "grad_norm": 0.34209317962476193,
+      "learning_rate": 1.4057396212046791e-05,
+      "loss": 0.5847,
+      "step": 4693
+    },
+    {
+      "epoch": 0.8344888888888888,
+      "grad_norm": 0.36687853786095037,
+      "learning_rate": 1.4027971843560494e-05,
+      "loss": 0.5787,
+      "step": 4694
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.3708913832996026,
+      "learning_rate": 1.3998575979229944e-05,
+      "loss": 0.5504,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8348444444444444,
+      "grad_norm": 0.3609894673592285,
+      "learning_rate": 1.3969208628801388e-05,
+      "loss": 0.5582,
+      "step": 4696
+    },
+    {
+      "epoch": 0.8350222222222222,
+      "grad_norm": 0.34447180561894797,
+      "learning_rate": 1.3939869802011618e-05,
+      "loss": 0.5484,
+      "step": 4697
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4006985868252093,
+      "learning_rate": 1.391055950858795e-05,
+      "loss": 0.6479,
+      "step": 4698
+    },
+    {
+      "epoch": 0.8353777777777778,
+      "grad_norm": 0.3573360529017714,
+      "learning_rate": 1.3881277758248267e-05,
+      "loss": 0.5531,
+      "step": 4699
+    },
+    {
+      "epoch": 0.8355555555555556,
+      "grad_norm": 0.33213851664597305,
+      "learning_rate": 1.3852024560700982e-05,
+      "loss": 0.5513,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.41589753370762306,
+      "learning_rate": 1.3822799925645036e-05,
+      "loss": 0.5824,
+      "step": 4701
+    },
+    {
+      "epoch": 0.8359111111111112,
+      "grad_norm": 0.3976682442153046,
+      "learning_rate": 1.379360386276991e-05,
+      "loss": 0.5795,
+      "step": 4702
+    },
+    {
+      "epoch": 0.8360888888888889,
+      "grad_norm": 0.3763522462606709,
+      "learning_rate": 1.376443638175554e-05,
+      "loss": 0.5923,
+      "step": 4703
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.37430688082659974,
+      "learning_rate": 1.373529749227256e-05,
+      "loss": 0.604,
+      "step": 4704
+    },
+    {
+      "epoch": 0.8364444444444444,
+      "grad_norm": 0.3366737118690265,
+      "learning_rate": 1.370618720398189e-05,
+      "loss": 0.5547,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8366222222222223,
+      "grad_norm": 0.33683852653720964,
+      "learning_rate": 1.3677105526535194e-05,
+      "loss": 0.5751,
+      "step": 4706
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3478412773567568,
+      "learning_rate": 1.3648052469574469e-05,
+      "loss": 0.5882,
+      "step": 4707
+    },
+    {
+      "epoch": 0.8369777777777778,
+      "grad_norm": 0.34079237360477427,
+      "learning_rate": 1.3619028042732373e-05,
+      "loss": 0.5661,
+      "step": 4708
+    },
+    {
+      "epoch": 0.8371555555555555,
+      "grad_norm": 0.3692521434195599,
+      "learning_rate": 1.3590032255631912e-05,
+      "loss": 0.5918,
+      "step": 4709
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.34799369761895843,
+      "learning_rate": 1.3561065117886783e-05,
+      "loss": 0.5549,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8375111111111111,
+      "grad_norm": 0.33730414740936115,
+      "learning_rate": 1.3532126639100995e-05,
+      "loss": 0.5323,
+      "step": 4711
+    },
+    {
+      "epoch": 0.8376888888888889,
+      "grad_norm": 0.3452876062552767,
+      "learning_rate": 1.3503216828869192e-05,
+      "loss": 0.5286,
+      "step": 4712
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.3535072696111852,
+      "learning_rate": 1.3474335696776453e-05,
+      "loss": 0.5694,
+      "step": 4713
+    },
+    {
+      "epoch": 0.8380444444444445,
+      "grad_norm": 0.35005022069257885,
+      "learning_rate": 1.344548325239835e-05,
+      "loss": 0.5851,
+      "step": 4714
+    },
+    {
+      "epoch": 0.8382222222222222,
+      "grad_norm": 0.3657775293257551,
+      "learning_rate": 1.3416659505300977e-05,
+      "loss": 0.5443,
+      "step": 4715
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3453286227490814,
+      "learning_rate": 1.3387864465040877e-05,
+      "loss": 0.5563,
+      "step": 4716
+    },
+    {
+      "epoch": 0.8385777777777778,
+      "grad_norm": 0.3341158471072853,
+      "learning_rate": 1.3359098141165093e-05,
+      "loss": 0.5348,
+      "step": 4717
+    },
+    {
+      "epoch": 0.8387555555555556,
+      "grad_norm": 0.3546163479163679,
+      "learning_rate": 1.333036054321114e-05,
+      "loss": 0.5437,
+      "step": 4718
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.3737540011008875,
+      "learning_rate": 1.3301651680707018e-05,
+      "loss": 0.5805,
+      "step": 4719
+    },
+    {
+      "epoch": 0.8391111111111111,
+      "grad_norm": 0.3722643561649962,
+      "learning_rate": 1.3272971563171189e-05,
+      "loss": 0.5949,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8392888888888889,
+      "grad_norm": 0.3707395251000393,
+      "learning_rate": 1.3244320200112592e-05,
+      "loss": 0.5432,
+      "step": 4721
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.37836128676167874,
+      "learning_rate": 1.321569760103063e-05,
+      "loss": 0.6212,
+      "step": 4722
+    },
+    {
+      "epoch": 0.8396444444444444,
+      "grad_norm": 0.3544421358498244,
+      "learning_rate": 1.3187103775415156e-05,
+      "loss": 0.5936,
+      "step": 4723
+    },
+    {
+      "epoch": 0.8398222222222222,
+      "grad_norm": 0.33515258921654983,
+      "learning_rate": 1.3158538732746517e-05,
+      "loss": 0.4993,
+      "step": 4724
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.34699889471605555,
+      "learning_rate": 1.3130002482495485e-05,
+      "loss": 0.5541,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8401777777777778,
+      "grad_norm": 0.35462966752122066,
+      "learning_rate": 1.3101495034123313e-05,
+      "loss": 0.6124,
+      "step": 4726
+    },
+    {
+      "epoch": 0.8403555555555555,
+      "grad_norm": 0.34788234745486424,
+      "learning_rate": 1.3073016397081638e-05,
+      "loss": 0.5097,
+      "step": 4727
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.3536679810195945,
+      "learning_rate": 1.3044566580812668e-05,
+      "loss": 0.556,
+      "step": 4728
+    },
+    {
+      "epoch": 0.8407111111111111,
+      "grad_norm": 0.35160306526637813,
+      "learning_rate": 1.3016145594748907e-05,
+      "loss": 0.5345,
+      "step": 4729
+    },
+    {
+      "epoch": 0.8408888888888889,
+      "grad_norm": 0.35094622854145036,
+      "learning_rate": 1.2987753448313456e-05,
+      "loss": 0.5662,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3342374365740332,
+      "learning_rate": 1.2959390150919681e-05,
+      "loss": 0.571,
+      "step": 4731
+    },
+    {
+      "epoch": 0.8412444444444445,
+      "grad_norm": 0.3438614824877184,
+      "learning_rate": 1.2931055711971574e-05,
+      "loss": 0.537,
+      "step": 4732
+    },
+    {
+      "epoch": 0.8414222222222222,
+      "grad_norm": 0.37726351991721147,
+      "learning_rate": 1.2902750140863373e-05,
+      "loss": 0.5907,
+      "step": 4733
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.36288867754957466,
+      "learning_rate": 1.2874473446979918e-05,
+      "loss": 0.5746,
+      "step": 4734
+    },
+    {
+      "epoch": 0.8417777777777777,
+      "grad_norm": 0.38479088388874017,
+      "learning_rate": 1.2846225639696318e-05,
+      "loss": 0.5817,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8419555555555556,
+      "grad_norm": 0.36044158051225755,
+      "learning_rate": 1.2818006728378219e-05,
+      "loss": 0.6072,
+      "step": 4736
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.3322545508931253,
+      "learning_rate": 1.278981672238161e-05,
+      "loss": 0.5309,
+      "step": 4737
+    },
+    {
+      "epoch": 0.8423111111111111,
+      "grad_norm": 0.34998865222203523,
+      "learning_rate": 1.276165563105296e-05,
+      "loss": 0.5469,
+      "step": 4738
+    },
+    {
+      "epoch": 0.8424888888888888,
+      "grad_norm": 0.3614983042391235,
+      "learning_rate": 1.2733523463729102e-05,
+      "loss": 0.5456,
+      "step": 4739
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.3621769330883001,
+      "learning_rate": 1.2705420229737307e-05,
+      "loss": 0.5717,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8428444444444444,
+      "grad_norm": 0.3453553891206609,
+      "learning_rate": 1.2677345938395247e-05,
+      "loss": 0.5683,
+      "step": 4741
+    },
+    {
+      "epoch": 0.8430222222222222,
+      "grad_norm": 0.35642392507552145,
+      "learning_rate": 1.2649300599010993e-05,
+      "loss": 0.5589,
+      "step": 4742
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.36333954238460286,
+      "learning_rate": 1.2621284220883011e-05,
+      "loss": 0.6244,
+      "step": 4743
+    },
+    {
+      "epoch": 0.8433777777777778,
+      "grad_norm": 0.34318169765839596,
+      "learning_rate": 1.2593296813300193e-05,
+      "loss": 0.5744,
+      "step": 4744
+    },
+    {
+      "epoch": 0.8435555555555555,
+      "grad_norm": 0.3655856012168314,
+      "learning_rate": 1.2565338385541792e-05,
+      "loss": 0.5598,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.34238761557465225,
+      "learning_rate": 1.253740894687747e-05,
+      "loss": 0.5552,
+      "step": 4746
+    },
+    {
+      "epoch": 0.8439111111111111,
+      "grad_norm": 0.34912156269278777,
+      "learning_rate": 1.250950850656727e-05,
+      "loss": 0.521,
+      "step": 4747
+    },
+    {
+      "epoch": 0.8440888888888889,
+      "grad_norm": 0.3985107189538369,
+      "learning_rate": 1.248163707386163e-05,
+      "loss": 0.6066,
+      "step": 4748
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.33137349715867626,
+      "learning_rate": 1.2453794658001371e-05,
+      "loss": 0.544,
+      "step": 4749
+    },
+    {
+      "epoch": 0.8444444444444444,
+      "grad_norm": 0.35943001535811403,
+      "learning_rate": 1.242598126821769e-05,
+      "loss": 0.5482,
+      "step": 4750
+    },
+    {
+      "epoch": 0.8446222222222223,
+      "grad_norm": 0.351013040159158,
+      "learning_rate": 1.2398196913732118e-05,
+      "loss": 0.5661,
+      "step": 4751
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3442254909732594,
+      "learning_rate": 1.2370441603756677e-05,
+      "loss": 0.57,
+      "step": 4752
+    },
+    {
+      "epoch": 0.8449777777777778,
+      "grad_norm": 0.399703745467623,
+      "learning_rate": 1.2342715347493594e-05,
+      "loss": 0.5219,
+      "step": 4753
+    },
+    {
+      "epoch": 0.8451555555555555,
+      "grad_norm": 0.33405618723654806,
+      "learning_rate": 1.2315018154135626e-05,
+      "loss": 0.542,
+      "step": 4754
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.4422806735381209,
+      "learning_rate": 1.2287350032865763e-05,
+      "loss": 0.5598,
+      "step": 4755
+    },
+    {
+      "epoch": 0.8455111111111111,
+      "grad_norm": 0.35883067303193905,
+      "learning_rate": 1.2259710992857465e-05,
+      "loss": 0.5566,
+      "step": 4756
+    },
+    {
+      "epoch": 0.8456888888888889,
+      "grad_norm": 0.3750495629289882,
+      "learning_rate": 1.2232101043274436e-05,
+      "loss": 0.5646,
+      "step": 4757
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.3679931369848527,
+      "learning_rate": 1.2204520193270863e-05,
+      "loss": 0.5854,
+      "step": 4758
+    },
+    {
+      "epoch": 0.8460444444444445,
+      "grad_norm": 0.3552643263327099,
+      "learning_rate": 1.2176968451991166e-05,
+      "loss": 0.5749,
+      "step": 4759
+    },
+    {
+      "epoch": 0.8462222222222222,
+      "grad_norm": 0.38476147490687185,
+      "learning_rate": 1.2149445828570195e-05,
+      "loss": 0.5859,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3642212026187823,
+      "learning_rate": 1.2121952332133091e-05,
+      "loss": 0.5524,
+      "step": 4761
+    },
+    {
+      "epoch": 0.8465777777777778,
+      "grad_norm": 0.35584989368735986,
+      "learning_rate": 1.2094487971795398e-05,
+      "loss": 0.5268,
+      "step": 4762
+    },
+    {
+      "epoch": 0.8467555555555556,
+      "grad_norm": 0.34108832633769864,
+      "learning_rate": 1.2067052756662945e-05,
+      "loss": 0.507,
+      "step": 4763
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.34378925720418096,
+      "learning_rate": 1.2039646695831918e-05,
+      "loss": 0.5772,
+      "step": 4764
+    },
+    {
+      "epoch": 0.8471111111111111,
+      "grad_norm": 0.41049911433418446,
+      "learning_rate": 1.2012269798388842e-05,
+      "loss": 0.5434,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8472888888888889,
+      "grad_norm": 0.3767381869755635,
+      "learning_rate": 1.1984922073410576e-05,
+      "loss": 0.5489,
+      "step": 4766
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.3721096722238437,
+      "learning_rate": 1.195760352996429e-05,
+      "loss": 0.5695,
+      "step": 4767
+    },
+    {
+      "epoch": 0.8476444444444444,
+      "grad_norm": 0.362848093784351,
+      "learning_rate": 1.1930314177107493e-05,
+      "loss": 0.5459,
+      "step": 4768
+    },
+    {
+      "epoch": 0.8478222222222223,
+      "grad_norm": 0.34717286324703983,
+      "learning_rate": 1.1903054023888017e-05,
+      "loss": 0.5899,
+      "step": 4769
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3545219124315328,
+      "learning_rate": 1.1875823079343996e-05,
+      "loss": 0.5747,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8481777777777778,
+      "grad_norm": 0.34753265955345275,
+      "learning_rate": 1.1848621352503885e-05,
+      "loss": 0.5376,
+      "step": 4771
+    },
+    {
+      "epoch": 0.8483555555555555,
+      "grad_norm": 0.36568871995128516,
+      "learning_rate": 1.1821448852386475e-05,
+      "loss": 0.5473,
+      "step": 4772
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.4502738349367513,
+      "learning_rate": 1.1794305588000843e-05,
+      "loss": 0.6042,
+      "step": 4773
+    },
+    {
+      "epoch": 0.8487111111111111,
+      "grad_norm": 0.3670197534523983,
+      "learning_rate": 1.1767191568346392e-05,
+      "loss": 0.5677,
+      "step": 4774
+    },
+    {
+      "epoch": 0.8488888888888889,
+      "grad_norm": 0.35559272170060463,
+      "learning_rate": 1.1740106802412765e-05,
+      "loss": 0.5809,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.35541789322148615,
+      "learning_rate": 1.1713051299180044e-05,
+      "loss": 0.5343,
+      "step": 4776
+    },
+    {
+      "epoch": 0.8492444444444445,
+      "grad_norm": 0.35009086622697455,
+      "learning_rate": 1.1686025067618423e-05,
+      "loss": 0.5456,
+      "step": 4777
+    },
+    {
+      "epoch": 0.8494222222222222,
+      "grad_norm": 0.36433975410470026,
+      "learning_rate": 1.1659028116688575e-05,
+      "loss": 0.5535,
+      "step": 4778
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.6527544694719213,
+      "learning_rate": 1.163206045534131e-05,
+      "loss": 0.5668,
+      "step": 4779
+    },
+    {
+      "epoch": 0.8497777777777777,
+      "grad_norm": 0.34961369799225933,
+      "learning_rate": 1.1605122092517874e-05,
+      "loss": 0.5416,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8499555555555556,
+      "grad_norm": 0.34719313568414933,
+      "learning_rate": 1.1578213037149633e-05,
+      "loss": 0.5738,
+      "step": 4781
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.3538687342981935,
+      "learning_rate": 1.1551333298158407e-05,
+      "loss": 0.5439,
+      "step": 4782
+    },
+    {
+      "epoch": 0.8503111111111111,
+      "grad_norm": 0.34614117005604167,
+      "learning_rate": 1.1524482884456146e-05,
+      "loss": 0.56,
+      "step": 4783
+    },
+    {
+      "epoch": 0.8504888888888888,
+      "grad_norm": 0.39264264525109654,
+      "learning_rate": 1.1497661804945215e-05,
+      "loss": 0.5929,
+      "step": 4784
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.3910874273180067,
+      "learning_rate": 1.1470870068518113e-05,
+      "loss": 0.5485,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8508444444444444,
+      "grad_norm": 0.3608068510718258,
+      "learning_rate": 1.1444107684057725e-05,
+      "loss": 0.5767,
+      "step": 4786
+    },
+    {
+      "epoch": 0.8510222222222222,
+      "grad_norm": 0.37414130238485216,
+      "learning_rate": 1.1417374660437153e-05,
+      "loss": 0.5872,
+      "step": 4787
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.36434561219039396,
+      "learning_rate": 1.139067100651976e-05,
+      "loss": 0.5596,
+      "step": 4788
+    },
+    {
+      "epoch": 0.8513777777777778,
+      "grad_norm": 0.36067319438993584,
+      "learning_rate": 1.1363996731159188e-05,
+      "loss": 0.5985,
+      "step": 4789
+    },
+    {
+      "epoch": 0.8515555555555555,
+      "grad_norm": 0.35726958111512597,
+      "learning_rate": 1.1337351843199329e-05,
+      "loss": 0.5697,
+      "step": 4790
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.33676148330206906,
+      "learning_rate": 1.131073635147435e-05,
+      "loss": 0.5334,
+      "step": 4791
+    },
+    {
+      "epoch": 0.8519111111111111,
+      "grad_norm": 0.3504475677876654,
+      "learning_rate": 1.1284150264808647e-05,
+      "loss": 0.5481,
+      "step": 4792
+    },
+    {
+      "epoch": 0.8520888888888889,
+      "grad_norm": 0.34021128509758136,
+      "learning_rate": 1.1257593592016868e-05,
+      "loss": 0.5484,
+      "step": 4793
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.34753025891454314,
+      "learning_rate": 1.123106634190394e-05,
+      "loss": 0.5271,
+      "step": 4794
+    },
+    {
+      "epoch": 0.8524444444444444,
+      "grad_norm": 0.4117510292340522,
+      "learning_rate": 1.1204568523265002e-05,
+      "loss": 0.5446,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8526222222222222,
+      "grad_norm": 0.3696257018451269,
+      "learning_rate": 1.117810014488544e-05,
+      "loss": 0.5705,
+      "step": 4796
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.33889651336059107,
+      "learning_rate": 1.1151661215540888e-05,
+      "loss": 0.5597,
+      "step": 4797
+    },
+    {
+      "epoch": 0.8529777777777777,
+      "grad_norm": 0.34895748706017543,
+      "learning_rate": 1.1125251743997223e-05,
+      "loss": 0.5562,
+      "step": 4798
+    },
+    {
+      "epoch": 0.8531555555555556,
+      "grad_norm": 0.34362043759441874,
+      "learning_rate": 1.109887173901053e-05,
+      "loss": 0.5306,
+      "step": 4799
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.3898415608484958,
+      "learning_rate": 1.107252120932717e-05,
+      "loss": 0.5661,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8535111111111111,
+      "grad_norm": 0.3415374936095161,
+      "learning_rate": 1.104620016368364e-05,
+      "loss": 0.5626,
+      "step": 4801
+    },
+    {
+      "epoch": 0.8536888888888889,
+      "grad_norm": 0.37522158181667603,
+      "learning_rate": 1.1019908610806794e-05,
+      "loss": 0.6005,
+      "step": 4802
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.36131441899660666,
+      "learning_rate": 1.0993646559413572e-05,
+      "loss": 0.5891,
+      "step": 4803
+    },
+    {
+      "epoch": 0.8540444444444445,
+      "grad_norm": 0.34241938894189483,
+      "learning_rate": 1.0967414018211264e-05,
+      "loss": 0.5202,
+      "step": 4804
+    },
+    {
+      "epoch": 0.8542222222222222,
+      "grad_norm": 0.34719160311850034,
+      "learning_rate": 1.0941210995897223e-05,
+      "loss": 0.5645,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.34985427851070866,
+      "learning_rate": 1.0915037501159197e-05,
+      "loss": 0.5635,
+      "step": 4806
+    },
+    {
+      "epoch": 0.8545777777777778,
+      "grad_norm": 0.36366812435766566,
+      "learning_rate": 1.0888893542674949e-05,
+      "loss": 0.5814,
+      "step": 4807
+    },
+    {
+      "epoch": 0.8547555555555556,
+      "grad_norm": 0.366244830038732,
+      "learning_rate": 1.0862779129112654e-05,
+      "loss": 0.5615,
+      "step": 4808
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.40703197904146177,
+      "learning_rate": 1.0836694269130498e-05,
+      "loss": 0.5014,
+      "step": 4809
+    },
+    {
+      "epoch": 0.8551111111111112,
+      "grad_norm": 0.3506976385982676,
+      "learning_rate": 1.0810638971376996e-05,
+      "loss": 0.5623,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8552888888888889,
+      "grad_norm": 0.3442271804689803,
+      "learning_rate": 1.0784613244490816e-05,
+      "loss": 0.5678,
+      "step": 4811
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.3392160167742886,
+      "learning_rate": 1.075861709710081e-05,
+      "loss": 0.5474,
+      "step": 4812
+    },
+    {
+      "epoch": 0.8556444444444444,
+      "grad_norm": 0.3332201752522239,
+      "learning_rate": 1.0732650537826061e-05,
+      "loss": 0.4998,
+      "step": 4813
+    },
+    {
+      "epoch": 0.8558222222222223,
+      "grad_norm": 0.3423359445476259,
+      "learning_rate": 1.0706713575275817e-05,
+      "loss": 0.5675,
+      "step": 4814
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.36234805255293767,
+      "learning_rate": 1.068080621804951e-05,
+      "loss": 0.6019,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8561777777777778,
+      "grad_norm": 0.35662999085142344,
+      "learning_rate": 1.065492847473677e-05,
+      "loss": 0.5682,
+      "step": 4816
+    },
+    {
+      "epoch": 0.8563555555555555,
+      "grad_norm": 0.3739820360524947,
+      "learning_rate": 1.0629080353917397e-05,
+      "loss": 0.5684,
+      "step": 4817
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.37954177131554695,
+      "learning_rate": 1.0603261864161384e-05,
+      "loss": 0.5715,
+      "step": 4818
+    },
+    {
+      "epoch": 0.8567111111111111,
+      "grad_norm": 0.356836286059798,
+      "learning_rate": 1.057747301402887e-05,
+      "loss": 0.5479,
+      "step": 4819
+    },
+    {
+      "epoch": 0.8568888888888889,
+      "grad_norm": 0.3871129480144662,
+      "learning_rate": 1.0551713812070207e-05,
+      "loss": 0.5694,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.3605913057905761,
+      "learning_rate": 1.0525984266825895e-05,
+      "loss": 0.5509,
+      "step": 4821
+    },
+    {
+      "epoch": 0.8572444444444445,
+      "grad_norm": 0.33998578271790825,
+      "learning_rate": 1.0500284386826597e-05,
+      "loss": 0.5954,
+      "step": 4822
+    },
+    {
+      "epoch": 0.8574222222222222,
+      "grad_norm": 0.34637790273855507,
+      "learning_rate": 1.0474614180593145e-05,
+      "loss": 0.5616,
+      "step": 4823
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.39437822277370416,
+      "learning_rate": 1.0448973656636562e-05,
+      "loss": 0.6021,
+      "step": 4824
+    },
+    {
+      "epoch": 0.8577777777777778,
+      "grad_norm": 0.3525446263767972,
+      "learning_rate": 1.0423362823457939e-05,
+      "loss": 0.6184,
+      "step": 4825
+    },
+    {
+      "epoch": 0.8579555555555556,
+      "grad_norm": 0.5661658862408048,
+      "learning_rate": 1.0397781689548669e-05,
+      "loss": 0.5526,
+      "step": 4826
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.35874014994062636,
+      "learning_rate": 1.0372230263390125e-05,
+      "loss": 0.5453,
+      "step": 4827
+    },
+    {
+      "epoch": 0.8583111111111111,
+      "grad_norm": 0.35445839982564686,
+      "learning_rate": 1.034670855345402e-05,
+      "loss": 0.5441,
+      "step": 4828
+    },
+    {
+      "epoch": 0.8584888888888889,
+      "grad_norm": 0.347106712554299,
+      "learning_rate": 1.032121656820202e-05,
+      "loss": 0.562,
+      "step": 4829
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.37276570010639615,
+      "learning_rate": 1.0295754316086114e-05,
+      "loss": 0.5663,
+      "step": 4830
+    },
+    {
+      "epoch": 0.8588444444444444,
+      "grad_norm": 0.3631701087001613,
+      "learning_rate": 1.0270321805548267e-05,
+      "loss": 0.5171,
+      "step": 4831
+    },
+    {
+      "epoch": 0.8590222222222222,
+      "grad_norm": 0.3589204793548821,
+      "learning_rate": 1.0244919045020763e-05,
+      "loss": 0.5349,
+      "step": 4832
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.34921926184657986,
+      "learning_rate": 1.0219546042925843e-05,
+      "loss": 0.5683,
+      "step": 4833
+    },
+    {
+      "epoch": 0.8593777777777778,
+      "grad_norm": 0.34527175198358123,
+      "learning_rate": 1.0194202807676e-05,
+      "loss": 0.5742,
+      "step": 4834
+    },
+    {
+      "epoch": 0.8595555555555555,
+      "grad_norm": 0.3613224906681506,
+      "learning_rate": 1.0168889347673816e-05,
+      "loss": 0.5722,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.37560789234311237,
+      "learning_rate": 1.0143605671312018e-05,
+      "loss": 0.5931,
+      "step": 4836
+    },
+    {
+      "epoch": 0.8599111111111111,
+      "grad_norm": 0.3442759842208663,
+      "learning_rate": 1.0118351786973423e-05,
+      "loss": 0.5554,
+      "step": 4837
+    },
+    {
+      "epoch": 0.8600888888888889,
+      "grad_norm": 0.33436241321031435,
+      "learning_rate": 1.0093127703031013e-05,
+      "loss": 0.5055,
+      "step": 4838
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.3747203433840497,
+      "learning_rate": 1.0067933427847864e-05,
+      "loss": 0.5545,
+      "step": 4839
+    },
+    {
+      "epoch": 0.8604444444444445,
+      "grad_norm": 0.34718261173951354,
+      "learning_rate": 1.0042768969777183e-05,
+      "loss": 0.5695,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8606222222222222,
+      "grad_norm": 0.3634506145474074,
+      "learning_rate": 1.0017634337162275e-05,
+      "loss": 0.5901,
+      "step": 4841
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.34979519930162906,
+      "learning_rate": 9.992529538336571e-06,
+      "loss": 0.5664,
+      "step": 4842
+    },
+    {
+      "epoch": 0.8609777777777777,
+      "grad_norm": 0.34677175722720904,
+      "learning_rate": 9.967454581623603e-06,
+      "loss": 0.581,
+      "step": 4843
+    },
+    {
+      "epoch": 0.8611555555555556,
+      "grad_norm": 0.3930518351612051,
+      "learning_rate": 9.942409475337012e-06,
+      "loss": 0.5689,
+      "step": 4844
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.35651409477168866,
+      "learning_rate": 9.91739422778054e-06,
+      "loss": 0.5614,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8615111111111111,
+      "grad_norm": 0.3559157065269722,
+      "learning_rate": 9.892408847248037e-06,
+      "loss": 0.548,
+      "step": 4846
+    },
+    {
+      "epoch": 0.8616888888888888,
+      "grad_norm": 0.3914772523093115,
+      "learning_rate": 9.867453342023437e-06,
+      "loss": 0.5753,
+      "step": 4847
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.35091384603381803,
+      "learning_rate": 9.84252772038079e-06,
+      "loss": 0.5464,
+      "step": 4848
+    },
+    {
+      "epoch": 0.8620444444444444,
+      "grad_norm": 0.38537027767745874,
+      "learning_rate": 9.817631990584165e-06,
+      "loss": 0.5881,
+      "step": 4849
+    },
+    {
+      "epoch": 0.8622222222222222,
+      "grad_norm": 0.3522400719219844,
+      "learning_rate": 9.792766160887868e-06,
+      "loss": 0.5615,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3502685764426094,
+      "learning_rate": 9.767930239536115e-06,
+      "loss": 0.5788,
+      "step": 4851
+    },
+    {
+      "epoch": 0.8625777777777778,
+      "grad_norm": 0.3394684151860436,
+      "learning_rate": 9.74312423476338e-06,
+      "loss": 0.5599,
+      "step": 4852
+    },
+    {
+      "epoch": 0.8627555555555556,
+      "grad_norm": 0.3582072624629696,
+      "learning_rate": 9.718348154794044e-06,
+      "loss": 0.5519,
+      "step": 4853
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.37811560194121324,
+      "learning_rate": 9.69360200784274e-06,
+      "loss": 0.5697,
+      "step": 4854
+    },
+    {
+      "epoch": 0.8631111111111112,
+      "grad_norm": 0.3570092818149156,
+      "learning_rate": 9.668885802114003e-06,
+      "loss": 0.601,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8632888888888889,
+      "grad_norm": 0.36027646511403677,
+      "learning_rate": 9.644199545802612e-06,
+      "loss": 0.5576,
+      "step": 4856
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.3392549055499985,
+      "learning_rate": 9.619543247093254e-06,
+      "loss": 0.5366,
+      "step": 4857
+    },
+    {
+      "epoch": 0.8636444444444444,
+      "grad_norm": 0.3648377579392886,
+      "learning_rate": 9.594916914160846e-06,
+      "loss": 0.5161,
+      "step": 4858
+    },
+    {
+      "epoch": 0.8638222222222223,
+      "grad_norm": 0.35677967372684327,
+      "learning_rate": 9.570320555170209e-06,
+      "loss": 0.5632,
+      "step": 4859
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4300638488986734,
+      "learning_rate": 9.545754178276344e-06,
+      "loss": 0.5627,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8641777777777778,
+      "grad_norm": 0.3688963784810361,
+      "learning_rate": 9.52121779162426e-06,
+      "loss": 0.5917,
+      "step": 4861
+    },
+    {
+      "epoch": 0.8643555555555555,
+      "grad_norm": 0.3583213105927634,
+      "learning_rate": 9.496711403349034e-06,
+      "loss": 0.5708,
+      "step": 4862
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.36842538116585044,
+      "learning_rate": 9.472235021575792e-06,
+      "loss": 0.5701,
+      "step": 4863
+    },
+    {
+      "epoch": 0.8647111111111111,
+      "grad_norm": 0.36609889257191197,
+      "learning_rate": 9.44778865441972e-06,
+      "loss": 0.5553,
+      "step": 4864
+    },
+    {
+      "epoch": 0.8648888888888889,
+      "grad_norm": 0.3764202076089415,
+      "learning_rate": 9.423372309986056e-06,
+      "loss": 0.5559,
+      "step": 4865
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.3581394101407454,
+      "learning_rate": 9.398985996370058e-06,
+      "loss": 0.5998,
+      "step": 4866
+    },
+    {
+      "epoch": 0.8652444444444445,
+      "grad_norm": 0.37110784997986695,
+      "learning_rate": 9.374629721657058e-06,
+      "loss": 0.5732,
+      "step": 4867
+    },
+    {
+      "epoch": 0.8654222222222222,
+      "grad_norm": 0.3588890955589848,
+      "learning_rate": 9.350303493922407e-06,
+      "loss": 0.5822,
+      "step": 4868
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.38172146199191637,
+      "learning_rate": 9.326007321231522e-06,
+      "loss": 0.5895,
+      "step": 4869
+    },
+    {
+      "epoch": 0.8657777777777778,
+      "grad_norm": 0.41367249115110094,
+      "learning_rate": 9.301741211639803e-06,
+      "loss": 0.5201,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8659555555555556,
+      "grad_norm": 0.3867458667685116,
+      "learning_rate": 9.277505173192746e-06,
+      "loss": 0.5669,
+      "step": 4871
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.3387990707696137,
+      "learning_rate": 9.253299213925847e-06,
+      "loss": 0.5638,
+      "step": 4872
+    },
+    {
+      "epoch": 0.8663111111111111,
+      "grad_norm": 0.4121068231758332,
+      "learning_rate": 9.229123341864577e-06,
+      "loss": 0.5368,
+      "step": 4873
+    },
+    {
+      "epoch": 0.8664888888888889,
+      "grad_norm": 0.40836474736598294,
+      "learning_rate": 9.204977565024564e-06,
+      "loss": 0.575,
+      "step": 4874
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.33781376916020694,
+      "learning_rate": 9.180861891411296e-06,
+      "loss": 0.5285,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8668444444444444,
+      "grad_norm": 0.37513893030639056,
+      "learning_rate": 9.156776329020434e-06,
+      "loss": 0.5989,
+      "step": 4876
+    },
+    {
+      "epoch": 0.8670222222222222,
+      "grad_norm": 0.3459116859150284,
+      "learning_rate": 9.13272088583751e-06,
+      "loss": 0.5542,
+      "step": 4877
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3748561217449105,
+      "learning_rate": 9.108695569838211e-06,
+      "loss": 0.6067,
+      "step": 4878
+    },
+    {
+      "epoch": 0.8673777777777778,
+      "grad_norm": 0.37166177876823725,
+      "learning_rate": 9.08470038898811e-06,
+      "loss": 0.5838,
+      "step": 4879
+    },
+    {
+      "epoch": 0.8675555555555555,
+      "grad_norm": 0.3639293533177192,
+      "learning_rate": 9.0607353512429e-06,
+      "loss": 0.5805,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.3524614450585736,
+      "learning_rate": 9.036800464548157e-06,
+      "loss": 0.5526,
+      "step": 4881
+    },
+    {
+      "epoch": 0.8679111111111111,
+      "grad_norm": 0.35216720341116686,
+      "learning_rate": 9.01289573683961e-06,
+      "loss": 0.5722,
+      "step": 4882
+    },
+    {
+      "epoch": 0.8680888888888889,
+      "grad_norm": 0.34458582148301775,
+      "learning_rate": 8.989021176042844e-06,
+      "loss": 0.5545,
+      "step": 4883
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.3535534854360052,
+      "learning_rate": 8.965176790073537e-06,
+      "loss": 0.5945,
+      "step": 4884
+    },
+    {
+      "epoch": 0.8684444444444445,
+      "grad_norm": 0.37994299135993975,
+      "learning_rate": 8.941362586837309e-06,
+      "loss": 0.5493,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8686222222222222,
+      "grad_norm": 0.35331124290447796,
+      "learning_rate": 8.917578574229812e-06,
+      "loss": 0.5588,
+      "step": 4886
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.36356216034730354,
+      "learning_rate": 8.89382476013667e-06,
+      "loss": 0.4765,
+      "step": 4887
+    },
+    {
+      "epoch": 0.8689777777777777,
+      "grad_norm": 0.3455222872070977,
+      "learning_rate": 8.870101152433497e-06,
+      "loss": 0.5255,
+      "step": 4888
+    },
+    {
+      "epoch": 0.8691555555555556,
+      "grad_norm": 0.35327539618497183,
+      "learning_rate": 8.846407758985886e-06,
+      "loss": 0.5875,
+      "step": 4889
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.3814891567007966,
+      "learning_rate": 8.822744587649412e-06,
+      "loss": 0.5812,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8695111111111111,
+      "grad_norm": 0.3462120641494648,
+      "learning_rate": 8.799111646269642e-06,
+      "loss": 0.5569,
+      "step": 4891
+    },
+    {
+      "epoch": 0.8696888888888888,
+      "grad_norm": 0.3302997355712388,
+      "learning_rate": 8.77550894268212e-06,
+      "loss": 0.5108,
+      "step": 4892
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.36296029776689936,
+      "learning_rate": 8.751936484712343e-06,
+      "loss": 0.5212,
+      "step": 4893
+    },
+    {
+      "epoch": 0.8700444444444444,
+      "grad_norm": 0.349799802592816,
+      "learning_rate": 8.728394280175812e-06,
+      "loss": 0.585,
+      "step": 4894
+    },
+    {
+      "epoch": 0.8702222222222222,
+      "grad_norm": 0.3568142073863479,
+      "learning_rate": 8.704882336877962e-06,
+      "loss": 0.5437,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3608429759785507,
+      "learning_rate": 8.681400662614225e-06,
+      "loss": 0.5406,
+      "step": 4896
+    },
+    {
+      "epoch": 0.8705777777777778,
+      "grad_norm": 0.3566866548339779,
+      "learning_rate": 8.657949265169984e-06,
+      "loss": 0.6063,
+      "step": 4897
+    },
+    {
+      "epoch": 0.8707555555555555,
+      "grad_norm": 0.3419606827452609,
+      "learning_rate": 8.634528152320598e-06,
+      "loss": 0.5456,
+      "step": 4898
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.3383960381343903,
+      "learning_rate": 8.611137331831331e-06,
+      "loss": 0.5727,
+      "step": 4899
+    },
+    {
+      "epoch": 0.8711111111111111,
+      "grad_norm": 0.357649197456501,
+      "learning_rate": 8.587776811457505e-06,
+      "loss": 0.565,
+      "step": 4900
+    },
+    {
+      "epoch": 0.8712888888888889,
+      "grad_norm": 0.347940600328016,
+      "learning_rate": 8.564446598944276e-06,
+      "loss": 0.5898,
+      "step": 4901
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.34746348166072344,
+      "learning_rate": 8.541146702026859e-06,
+      "loss": 0.5326,
+      "step": 4902
+    },
+    {
+      "epoch": 0.8716444444444444,
+      "grad_norm": 0.35249845515358874,
+      "learning_rate": 8.51787712843033e-06,
+      "loss": 0.557,
+      "step": 4903
+    },
+    {
+      "epoch": 0.8718222222222223,
+      "grad_norm": 0.3526640751948099,
+      "learning_rate": 8.494637885869794e-06,
+      "loss": 0.5222,
+      "step": 4904
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3810496935385654,
+      "learning_rate": 8.4714289820502e-06,
+      "loss": 0.6027,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8721777777777778,
+      "grad_norm": 0.3429711964810892,
+      "learning_rate": 8.44825042466657e-06,
+      "loss": 0.5641,
+      "step": 4906
+    },
+    {
+      "epoch": 0.8723555555555556,
+      "grad_norm": 0.3552161266510762,
+      "learning_rate": 8.425102221403725e-06,
+      "loss": 0.5364,
+      "step": 4907
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.3648316144392625,
+      "learning_rate": 8.401984379936523e-06,
+      "loss": 0.5722,
+      "step": 4908
+    },
+    {
+      "epoch": 0.8727111111111111,
+      "grad_norm": 0.36175111503660295,
+      "learning_rate": 8.37889690792969e-06,
+      "loss": 0.534,
+      "step": 4909
+    },
+    {
+      "epoch": 0.8728888888888889,
+      "grad_norm": 0.3464947372391652,
+      "learning_rate": 8.355839813037936e-06,
+      "loss": 0.564,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.34586843533029377,
+      "learning_rate": 8.332813102905868e-06,
+      "loss": 0.5159,
+      "step": 4911
+    },
+    {
+      "epoch": 0.8732444444444445,
+      "grad_norm": 0.3368854651890267,
+      "learning_rate": 8.309816785168034e-06,
+      "loss": 0.5341,
+      "step": 4912
+    },
+    {
+      "epoch": 0.8734222222222222,
+      "grad_norm": 0.440490177864888,
+      "learning_rate": 8.286850867448881e-06,
+      "loss": 0.5649,
+      "step": 4913
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.36037561290838116,
+      "learning_rate": 8.263915357362806e-06,
+      "loss": 0.5651,
+      "step": 4914
+    },
+    {
+      "epoch": 0.8737777777777778,
+      "grad_norm": 0.3596925635758404,
+      "learning_rate": 8.241010262514115e-06,
+      "loss": 0.551,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8739555555555556,
+      "grad_norm": 0.40025486416109063,
+      "learning_rate": 8.218135590497023e-06,
+      "loss": 0.6128,
+      "step": 4916
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.3644076165648525,
+      "learning_rate": 8.19529134889565e-06,
+      "loss": 0.5695,
+      "step": 4917
+    },
+    {
+      "epoch": 0.8743111111111111,
+      "grad_norm": 0.3553574029414163,
+      "learning_rate": 8.172477545284052e-06,
+      "loss": 0.5573,
+      "step": 4918
+    },
+    {
+      "epoch": 0.8744888888888889,
+      "grad_norm": 0.3388988173482163,
+      "learning_rate": 8.149694187226187e-06,
+      "loss": 0.5336,
+      "step": 4919
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.3532443468499122,
+      "learning_rate": 8.12694128227589e-06,
+      "loss": 0.5565,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8748444444444444,
+      "grad_norm": 0.37914642324948045,
+      "learning_rate": 8.10421883797694e-06,
+      "loss": 0.5475,
+      "step": 4921
+    },
+    {
+      "epoch": 0.8750222222222223,
+      "grad_norm": 0.3658351988938684,
+      "learning_rate": 8.081526861863008e-06,
+      "loss": 0.5268,
+      "step": 4922
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.35017344271945894,
+      "learning_rate": 8.058865361457601e-06,
+      "loss": 0.5732,
+      "step": 4923
+    },
+    {
+      "epoch": 0.8753777777777778,
+      "grad_norm": 0.37178827768644324,
+      "learning_rate": 8.03623434427424e-06,
+      "loss": 0.582,
+      "step": 4924
+    },
+    {
+      "epoch": 0.8755555555555555,
+      "grad_norm": 0.39488935566244054,
+      "learning_rate": 8.013633817816202e-06,
+      "loss": 0.5927,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.3343456484834358,
+      "learning_rate": 7.991063789576814e-06,
+      "loss": 0.5339,
+      "step": 4926
+    },
+    {
+      "epoch": 0.8759111111111111,
+      "grad_norm": 0.34469056161351913,
+      "learning_rate": 7.9685242670391e-06,
+      "loss": 0.5639,
+      "step": 4927
+    },
+    {
+      "epoch": 0.8760888888888889,
+      "grad_norm": 0.3601018622991615,
+      "learning_rate": 7.946015257676177e-06,
+      "loss": 0.5383,
+      "step": 4928
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.34341839122525364,
+      "learning_rate": 7.923536768950856e-06,
+      "loss": 0.5564,
+      "step": 4929
+    },
+    {
+      "epoch": 0.8764444444444445,
+      "grad_norm": 0.351305706174665,
+      "learning_rate": 7.901088808315971e-06,
+      "loss": 0.5291,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8766222222222222,
+      "grad_norm": 0.3920813562978685,
+      "learning_rate": 7.878671383214153e-06,
+      "loss": 0.6173,
+      "step": 4931
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3532992199156299,
+      "learning_rate": 7.856284501077926e-06,
+      "loss": 0.6039,
+      "step": 4932
+    },
+    {
+      "epoch": 0.8769777777777777,
+      "grad_norm": 0.3935488579757422,
+      "learning_rate": 7.833928169329695e-06,
+      "loss": 0.5772,
+      "step": 4933
+    },
+    {
+      "epoch": 0.8771555555555556,
+      "grad_norm": 0.40192098740588955,
+      "learning_rate": 7.811602395381756e-06,
+      "loss": 0.5287,
+      "step": 4934
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.3586012157960648,
+      "learning_rate": 7.789307186636242e-06,
+      "loss": 0.5664,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8775111111111111,
+      "grad_norm": 0.3595743228683377,
+      "learning_rate": 7.76704255048516e-06,
+      "loss": 0.5746,
+      "step": 4936
+    },
+    {
+      "epoch": 0.8776888888888889,
+      "grad_norm": 0.37029370994889327,
+      "learning_rate": 7.744808494310386e-06,
+      "loss": 0.5313,
+      "step": 4937
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.4086261730048563,
+      "learning_rate": 7.722605025483654e-06,
+      "loss": 0.5068,
+      "step": 4938
+    },
+    {
+      "epoch": 0.8780444444444444,
+      "grad_norm": 0.36716075857822594,
+      "learning_rate": 7.700432151366554e-06,
+      "loss": 0.5586,
+      "step": 4939
+    },
+    {
+      "epoch": 0.8782222222222222,
+      "grad_norm": 0.3510213919722963,
+      "learning_rate": 7.678289879310541e-06,
+      "loss": 0.5629,
+      "step": 4940
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.34684632866221277,
+      "learning_rate": 7.656178216656928e-06,
+      "loss": 0.5524,
+      "step": 4941
+    },
+    {
+      "epoch": 0.8785777777777778,
+      "grad_norm": 0.33615084751081836,
+      "learning_rate": 7.634097170736853e-06,
+      "loss": 0.5312,
+      "step": 4942
+    },
+    {
+      "epoch": 0.8787555555555555,
+      "grad_norm": 0.3677593601431943,
+      "learning_rate": 7.612046748871327e-06,
+      "loss": 0.5424,
+      "step": 4943
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.39760225939574145,
+      "learning_rate": 7.590026958371199e-06,
+      "loss": 0.5503,
+      "step": 4944
+    },
+    {
+      "epoch": 0.8791111111111111,
+      "grad_norm": 0.33579010089194705,
+      "learning_rate": 7.568037806537176e-06,
+      "loss": 0.5351,
+      "step": 4945
+    },
+    {
+      "epoch": 0.8792888888888889,
+      "grad_norm": 0.34114272321824585,
+      "learning_rate": 7.5460793006597806e-06,
+      "loss": 0.5664,
+      "step": 4946
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.3782973478343387,
+      "learning_rate": 7.524151448019389e-06,
+      "loss": 0.5708,
+      "step": 4947
+    },
+    {
+      "epoch": 0.8796444444444445,
+      "grad_norm": 0.3676685406761337,
+      "learning_rate": 7.50225425588621e-06,
+      "loss": 0.5453,
+      "step": 4948
+    },
+    {
+      "epoch": 0.8798222222222222,
+      "grad_norm": 0.37242916093812084,
+      "learning_rate": 7.480387731520311e-06,
+      "loss": 0.5619,
+      "step": 4949
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3677117971216192,
+      "learning_rate": 7.458551882171549e-06,
+      "loss": 0.5897,
+      "step": 4950
+    },
+    {
+      "epoch": 0.8801777777777777,
+      "grad_norm": 0.3260294521421455,
+      "learning_rate": 7.436746715079645e-06,
+      "loss": 0.5244,
+      "step": 4951
+    },
+    {
+      "epoch": 0.8803555555555556,
+      "grad_norm": 0.3509058225477963,
+      "learning_rate": 7.414972237474138e-06,
+      "loss": 0.5557,
+      "step": 4952
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4318547104465892,
+      "learning_rate": 7.393228456574374e-06,
+      "loss": 0.5878,
+      "step": 4953
+    },
+    {
+      "epoch": 0.8807111111111111,
+      "grad_norm": 0.37706201529314576,
+      "learning_rate": 7.371515379589555e-06,
+      "loss": 0.555,
+      "step": 4954
+    },
+    {
+      "epoch": 0.8808888888888889,
+      "grad_norm": 0.3569905939695716,
+      "learning_rate": 7.349833013718666e-06,
+      "loss": 0.5427,
+      "step": 4955
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.350934165811864,
+      "learning_rate": 7.328181366150533e-06,
+      "loss": 0.5355,
+      "step": 4956
+    },
+    {
+      "epoch": 0.8812444444444445,
+      "grad_norm": 0.36792800818312366,
+      "learning_rate": 7.306560444063826e-06,
+      "loss": 0.578,
+      "step": 4957
+    },
+    {
+      "epoch": 0.8814222222222222,
+      "grad_norm": 0.4169243524920248,
+      "learning_rate": 7.284970254626922e-06,
+      "loss": 0.5509,
+      "step": 4958
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3385980146133746,
+      "learning_rate": 7.263410804998161e-06,
+      "loss": 0.5708,
+      "step": 4959
+    },
+    {
+      "epoch": 0.8817777777777778,
+      "grad_norm": 0.34399054053671324,
+      "learning_rate": 7.2418821023255365e-06,
+      "loss": 0.5555,
+      "step": 4960
+    },
+    {
+      "epoch": 0.8819555555555556,
+      "grad_norm": 0.3648413887850772,
+      "learning_rate": 7.220384153746995e-06,
+      "loss": 0.5937,
+      "step": 4961
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.3519060596881368,
+      "learning_rate": 7.198916966390146e-06,
+      "loss": 0.5648,
+      "step": 4962
+    },
+    {
+      "epoch": 0.8823111111111112,
+      "grad_norm": 0.3711091594921909,
+      "learning_rate": 7.177480547372528e-06,
+      "loss": 0.5203,
+      "step": 4963
+    },
+    {
+      "epoch": 0.8824888888888889,
+      "grad_norm": 0.35220582447695326,
+      "learning_rate": 7.156074903801369e-06,
+      "loss": 0.6006,
+      "step": 4964
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.34443800193528507,
+      "learning_rate": 7.13470004277379e-06,
+      "loss": 0.5139,
+      "step": 4965
+    },
+    {
+      "epoch": 0.8828444444444444,
+      "grad_norm": 0.40627494362402267,
+      "learning_rate": 7.113355971376612e-06,
+      "loss": 0.5353,
+      "step": 4966
+    },
+    {
+      "epoch": 0.8830222222222223,
+      "grad_norm": 0.32832377807936153,
+      "learning_rate": 7.092042696686518e-06,
+      "loss": 0.5507,
+      "step": 4967
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.34917033308837014,
+      "learning_rate": 7.0707602257699565e-06,
+      "loss": 0.5288,
+      "step": 4968
+    },
+    {
+      "epoch": 0.8833777777777778,
+      "grad_norm": 0.3508002356312644,
+      "learning_rate": 7.0495085656831495e-06,
+      "loss": 0.5461,
+      "step": 4969
+    },
+    {
+      "epoch": 0.8835555555555555,
+      "grad_norm": 0.3677926771044768,
+      "learning_rate": 7.028287723472138e-06,
+      "loss": 0.5103,
+      "step": 4970
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.3644173035666489,
+      "learning_rate": 7.007097706172705e-06,
+      "loss": 0.5426,
+      "step": 4971
+    },
+    {
+      "epoch": 0.8839111111111111,
+      "grad_norm": 0.3531911324846641,
+      "learning_rate": 6.985938520810442e-06,
+      "loss": 0.5653,
+      "step": 4972
+    },
+    {
+      "epoch": 0.8840888888888889,
+      "grad_norm": 0.34920737352624664,
+      "learning_rate": 6.964810174400705e-06,
+      "loss": 0.525,
+      "step": 4973
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.35519717281731944,
+      "learning_rate": 6.943712673948644e-06,
+      "loss": 0.6067,
+      "step": 4974
+    },
+    {
+      "epoch": 0.8844444444444445,
+      "grad_norm": 0.3579342074915068,
+      "learning_rate": 6.922646026449142e-06,
+      "loss": 0.5639,
+      "step": 4975
+    },
+    {
+      "epoch": 0.8846222222222222,
+      "grad_norm": 0.3569204557717254,
+      "learning_rate": 6.901610238886891e-06,
+      "loss": 0.5762,
+      "step": 4976
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.36271755784208287,
+      "learning_rate": 6.880605318236344e-06,
+      "loss": 0.556,
+      "step": 4977
+    },
+    {
+      "epoch": 0.8849777777777778,
+      "grad_norm": 0.34077007678852844,
+      "learning_rate": 6.859631271461708e-06,
+      "loss": 0.5374,
+      "step": 4978
+    },
+    {
+      "epoch": 0.8851555555555556,
+      "grad_norm": 0.35792751648345417,
+      "learning_rate": 6.838688105516955e-06,
+      "loss": 0.5599,
+      "step": 4979
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.3589913668513894,
+      "learning_rate": 6.817775827345829e-06,
+      "loss": 0.6089,
+      "step": 4980
+    },
+    {
+      "epoch": 0.8855111111111111,
+      "grad_norm": 0.34451711638140914,
+      "learning_rate": 6.7968944438818404e-06,
+      "loss": 0.5874,
+      "step": 4981
+    },
+    {
+      "epoch": 0.8856888888888889,
+      "grad_norm": 0.3691697573221582,
+      "learning_rate": 6.776043962048195e-06,
+      "loss": 0.5506,
+      "step": 4982
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.3580165336381914,
+      "learning_rate": 6.755224388757974e-06,
+      "loss": 0.554,
+      "step": 4983
+    },
+    {
+      "epoch": 0.8860444444444444,
+      "grad_norm": 0.3752294721934302,
+      "learning_rate": 6.734435730913868e-06,
+      "loss": 0.5866,
+      "step": 4984
+    },
+    {
+      "epoch": 0.8862222222222222,
+      "grad_norm": 0.36201086539983857,
+      "learning_rate": 6.713677995408452e-06,
+      "loss": 0.5987,
+      "step": 4985
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3604756887631126,
+      "learning_rate": 6.692951189123919e-06,
+      "loss": 0.5511,
+      "step": 4986
+    },
+    {
+      "epoch": 0.8865777777777778,
+      "grad_norm": 0.3477636550268372,
+      "learning_rate": 6.672255318932341e-06,
+      "loss": 0.5414,
+      "step": 4987
+    },
+    {
+      "epoch": 0.8867555555555555,
+      "grad_norm": 0.3530341279912555,
+      "learning_rate": 6.651590391695395e-06,
+      "loss": 0.5888,
+      "step": 4988
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.35161616628429393,
+      "learning_rate": 6.630956414264644e-06,
+      "loss": 0.5576,
+      "step": 4989
+    },
+    {
+      "epoch": 0.8871111111111111,
+      "grad_norm": 0.4041927729714772,
+      "learning_rate": 6.61035339348125e-06,
+      "loss": 0.6343,
+      "step": 4990
+    },
+    {
+      "epoch": 0.8872888888888889,
+      "grad_norm": 0.33856019761784983,
+      "learning_rate": 6.589781336176204e-06,
+      "loss": 0.5374,
+      "step": 4991
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.3885284915670537,
+      "learning_rate": 6.569240249170206e-06,
+      "loss": 0.5794,
+      "step": 4992
+    },
+    {
+      "epoch": 0.8876444444444445,
+      "grad_norm": 0.35667795491131,
+      "learning_rate": 6.548730139273662e-06,
+      "loss": 0.5426,
+      "step": 4993
+    },
+    {
+      "epoch": 0.8878222222222222,
+      "grad_norm": 0.3638374981061401,
+      "learning_rate": 6.528251013286757e-06,
+      "loss": 0.5559,
+      "step": 4994
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3608979244833006,
+      "learning_rate": 6.507802877999369e-06,
+      "loss": 0.5468,
+      "step": 4995
+    },
+    {
+      "epoch": 0.8881777777777777,
+      "grad_norm": 0.32491895384219516,
+      "learning_rate": 6.4873857401910875e-06,
+      "loss": 0.5313,
+      "step": 4996
+    },
+    {
+      "epoch": 0.8883555555555556,
+      "grad_norm": 0.3528662823733003,
+      "learning_rate": 6.466999606631275e-06,
+      "loss": 0.5863,
+      "step": 4997
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.34073769137489074,
+      "learning_rate": 6.4466444840789674e-06,
+      "loss": 0.5222,
+      "step": 4998
+    },
+    {
+      "epoch": 0.8887111111111111,
+      "grad_norm": 0.36596257267612503,
+      "learning_rate": 6.426320379282946e-06,
+      "loss": 0.587,
+      "step": 4999
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.33637168534101186,
+      "learning_rate": 6.406027298981687e-06,
+      "loss": 0.5314,
+      "step": 5000
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.3654380980837454,
+      "learning_rate": 6.3857652499033974e-06,
+      "loss": 0.5839,
+      "step": 5001
+    },
+    {
+      "epoch": 0.8892444444444444,
+      "grad_norm": 0.3525552894867388,
+      "learning_rate": 6.365534238765991e-06,
+      "loss": 0.5226,
+      "step": 5002
+    },
+    {
+      "epoch": 0.8894222222222222,
+      "grad_norm": 0.3557091719077854,
+      "learning_rate": 6.345334272277092e-06,
+      "loss": 0.555,
+      "step": 5003
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3389981183235064,
+      "learning_rate": 6.325165357134022e-06,
+      "loss": 0.5218,
+      "step": 5004
+    },
+    {
+      "epoch": 0.8897777777777778,
+      "grad_norm": 0.36210474787081404,
+      "learning_rate": 6.3050275000238414e-06,
+      "loss": 0.535,
+      "step": 5005
+    },
+    {
+      "epoch": 0.8899555555555556,
+      "grad_norm": 0.36732324809316436,
+      "learning_rate": 6.284920707623232e-06,
+      "loss": 0.6119,
+      "step": 5006
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.34123277302421423,
+      "learning_rate": 6.264844986598695e-06,
+      "loss": 0.5506,
+      "step": 5007
+    },
+    {
+      "epoch": 0.8903111111111112,
+      "grad_norm": 0.37968768960721694,
+      "learning_rate": 6.244800343606305e-06,
+      "loss": 0.5928,
+      "step": 5008
+    },
+    {
+      "epoch": 0.8904888888888889,
+      "grad_norm": 0.34169195159398585,
+      "learning_rate": 6.22478678529197e-06,
+      "loss": 0.5624,
+      "step": 5009
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.3560548797808062,
+      "learning_rate": 6.2048043182911245e-06,
+      "loss": 0.5951,
+      "step": 5010
+    },
+    {
+      "epoch": 0.8908444444444444,
+      "grad_norm": 0.340923851394203,
+      "learning_rate": 6.18485294922907e-06,
+      "loss": 0.5401,
+      "step": 5011
+    },
+    {
+      "epoch": 0.8910222222222223,
+      "grad_norm": 0.3610705594006183,
+      "learning_rate": 6.164932684720637e-06,
+      "loss": 0.5325,
+      "step": 5012
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3466926729042091,
+      "learning_rate": 6.145043531370498e-06,
+      "loss": 0.5602,
+      "step": 5013
+    },
+    {
+      "epoch": 0.8913777777777778,
+      "grad_norm": 0.33891049189584005,
+      "learning_rate": 6.1251854957728445e-06,
+      "loss": 0.5703,
+      "step": 5014
+    },
+    {
+      "epoch": 0.8915555555555555,
+      "grad_norm": 0.36857645357246654,
+      "learning_rate": 6.105358584511733e-06,
+      "loss": 0.5749,
+      "step": 5015
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.3801085081840926,
+      "learning_rate": 6.085562804160727e-06,
+      "loss": 0.5948,
+      "step": 5016
+    },
+    {
+      "epoch": 0.8919111111111111,
+      "grad_norm": 0.3879828280407481,
+      "learning_rate": 6.065798161283187e-06,
+      "loss": 0.5884,
+      "step": 5017
+    },
+    {
+      "epoch": 0.8920888888888889,
+      "grad_norm": 0.38240214289773117,
+      "learning_rate": 6.046064662432105e-06,
+      "loss": 0.5467,
+      "step": 5018
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.33721482017331217,
+      "learning_rate": 6.026362314150136e-06,
+      "loss": 0.5289,
+      "step": 5019
+    },
+    {
+      "epoch": 0.8924444444444445,
+      "grad_norm": 0.3757898139758863,
+      "learning_rate": 6.006691122969643e-06,
+      "loss": 0.5474,
+      "step": 5020
+    },
+    {
+      "epoch": 0.8926222222222222,
+      "grad_norm": 0.35362258474386377,
+      "learning_rate": 5.987051095412632e-06,
+      "loss": 0.5597,
+      "step": 5021
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3571203511671678,
+      "learning_rate": 5.967442237990783e-06,
+      "loss": 0.5639,
+      "step": 5022
+    },
+    {
+      "epoch": 0.8929777777777778,
+      "grad_norm": 0.3623767948783689,
+      "learning_rate": 5.9478645572054406e-06,
+      "loss": 0.5674,
+      "step": 5023
+    },
+    {
+      "epoch": 0.8931555555555556,
+      "grad_norm": 0.33966115668940144,
+      "learning_rate": 5.928318059547622e-06,
+      "loss": 0.5553,
+      "step": 5024
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.3467063412290638,
+      "learning_rate": 5.908802751497988e-06,
+      "loss": 0.5701,
+      "step": 5025
+    },
+    {
+      "epoch": 0.8935111111111111,
+      "grad_norm": 0.3346566742104518,
+      "learning_rate": 5.889318639526875e-06,
+      "loss": 0.5595,
+      "step": 5026
+    },
+    {
+      "epoch": 0.8936888888888889,
+      "grad_norm": 0.4296464805878177,
+      "learning_rate": 5.869865730094271e-06,
+      "loss": 0.5616,
+      "step": 5027
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.3465972038776893,
+      "learning_rate": 5.850444029649804e-06,
+      "loss": 0.5493,
+      "step": 5028
+    },
+    {
+      "epoch": 0.8940444444444444,
+      "grad_norm": 0.3565655259205237,
+      "learning_rate": 5.831053544632803e-06,
+      "loss": 0.5464,
+      "step": 5029
+    },
+    {
+      "epoch": 0.8942222222222223,
+      "grad_norm": 0.38441586711030085,
+      "learning_rate": 5.811694281472158e-06,
+      "loss": 0.6153,
+      "step": 5030
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.34304361941421097,
+      "learning_rate": 5.792366246586511e-06,
+      "loss": 0.5074,
+      "step": 5031
+    },
+    {
+      "epoch": 0.8945777777777778,
+      "grad_norm": 0.3453592481791633,
+      "learning_rate": 5.773069446384061e-06,
+      "loss": 0.5316,
+      "step": 5032
+    },
+    {
+      "epoch": 0.8947555555555555,
+      "grad_norm": 0.3374299219049813,
+      "learning_rate": 5.753803887262743e-06,
+      "loss": 0.5288,
+      "step": 5033
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.3228364987044134,
+      "learning_rate": 5.734569575610027e-06,
+      "loss": 0.5368,
+      "step": 5034
+    },
+    {
+      "epoch": 0.8951111111111111,
+      "grad_norm": 0.35533818079732377,
+      "learning_rate": 5.715366517803123e-06,
+      "loss": 0.5438,
+      "step": 5035
+    },
+    {
+      "epoch": 0.8952888888888889,
+      "grad_norm": 0.34821543939370586,
+      "learning_rate": 5.696194720208792e-06,
+      "loss": 0.5102,
+      "step": 5036
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.32735015436309456,
+      "learning_rate": 5.677054189183517e-06,
+      "loss": 0.5559,
+      "step": 5037
+    },
+    {
+      "epoch": 0.8956444444444445,
+      "grad_norm": 0.39232035417015443,
+      "learning_rate": 5.657944931073312e-06,
+      "loss": 0.5916,
+      "step": 5038
+    },
+    {
+      "epoch": 0.8958222222222222,
+      "grad_norm": 0.3423079683123103,
+      "learning_rate": 5.63886695221395e-06,
+      "loss": 0.5595,
+      "step": 5039
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.35412465539752674,
+      "learning_rate": 5.619820258930719e-06,
+      "loss": 0.5691,
+      "step": 5040
+    },
+    {
+      "epoch": 0.8961777777777777,
+      "grad_norm": 0.36353787847930075,
+      "learning_rate": 5.600804857538588e-06,
+      "loss": 0.5709,
+      "step": 5041
+    },
+    {
+      "epoch": 0.8963555555555556,
+      "grad_norm": 0.3691614187546208,
+      "learning_rate": 5.581820754342137e-06,
+      "loss": 0.5655,
+      "step": 5042
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.375739456665465,
+      "learning_rate": 5.562867955635587e-06,
+      "loss": 0.5932,
+      "step": 5043
+    },
+    {
+      "epoch": 0.8967111111111111,
+      "grad_norm": 0.3595972612421251,
+      "learning_rate": 5.543946467702754e-06,
+      "loss": 0.5568,
+      "step": 5044
+    },
+    {
+      "epoch": 0.8968888888888888,
+      "grad_norm": 0.3767788880692,
+      "learning_rate": 5.525056296817099e-06,
+      "loss": 0.5587,
+      "step": 5045
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.34342813588366167,
+      "learning_rate": 5.506197449241679e-06,
+      "loss": 0.5368,
+      "step": 5046
+    },
+    {
+      "epoch": 0.8972444444444444,
+      "grad_norm": 0.35004629720539576,
+      "learning_rate": 5.4873699312291695e-06,
+      "loss": 0.5512,
+      "step": 5047
+    },
+    {
+      "epoch": 0.8974222222222222,
+      "grad_norm": 0.3386492511363029,
+      "learning_rate": 5.468573749021866e-06,
+      "loss": 0.518,
+      "step": 5048
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.38071086192975595,
+      "learning_rate": 5.449808908851673e-06,
+      "loss": 0.5635,
+      "step": 5049
+    },
+    {
+      "epoch": 0.8977777777777778,
+      "grad_norm": 0.36057247620962357,
+      "learning_rate": 5.431075416940101e-06,
+      "loss": 0.5088,
+      "step": 5050
+    },
+    {
+      "epoch": 0.8979555555555555,
+      "grad_norm": 0.34094275530983353,
+      "learning_rate": 5.412373279498273e-06,
+      "loss": 0.5662,
+      "step": 5051
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.3269356539111094,
+      "learning_rate": 5.393702502726905e-06,
+      "loss": 0.5343,
+      "step": 5052
+    },
+    {
+      "epoch": 0.8983111111111111,
+      "grad_norm": 0.4427265434685174,
+      "learning_rate": 5.375063092816313e-06,
+      "loss": 0.5559,
+      "step": 5053
+    },
+    {
+      "epoch": 0.8984888888888889,
+      "grad_norm": 0.3471073482109714,
+      "learning_rate": 5.356455055946441e-06,
+      "loss": 0.5647,
+      "step": 5054
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.3651122289531994,
+      "learning_rate": 5.337878398286799e-06,
+      "loss": 0.5623,
+      "step": 5055
+    },
+    {
+      "epoch": 0.8988444444444444,
+      "grad_norm": 0.3764993611257617,
+      "learning_rate": 5.319333125996495e-06,
+      "loss": 0.5717,
+      "step": 5056
+    },
+    {
+      "epoch": 0.8990222222222222,
+      "grad_norm": 0.3807105325911764,
+      "learning_rate": 5.300819245224275e-06,
+      "loss": 0.5927,
+      "step": 5057
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3675944331230521,
+      "learning_rate": 5.282336762108398e-06,
+      "loss": 0.5673,
+      "step": 5058
+    },
+    {
+      "epoch": 0.8993777777777778,
+      "grad_norm": 0.36388847045961525,
+      "learning_rate": 5.263885682776804e-06,
+      "loss": 0.5561,
+      "step": 5059
+    },
+    {
+      "epoch": 0.8995555555555556,
+      "grad_norm": 0.49477231743873035,
+      "learning_rate": 5.245466013346945e-06,
+      "loss": 0.5667,
+      "step": 5060
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.33638623314391286,
+      "learning_rate": 5.2270777599259135e-06,
+      "loss": 0.5372,
+      "step": 5061
+    },
+    {
+      "epoch": 0.8999111111111111,
+      "grad_norm": 0.3521882897829024,
+      "learning_rate": 5.208720928610333e-06,
+      "loss": 0.5535,
+      "step": 5062
+    },
+    {
+      "epoch": 0.9000888888888889,
+      "grad_norm": 0.5628962500588736,
+      "learning_rate": 5.190395525486491e-06,
+      "loss": 0.5342,
+      "step": 5063
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.47380844913950926,
+      "learning_rate": 5.172101556630149e-06,
+      "loss": 0.5004,
+      "step": 5064
+    },
+    {
+      "epoch": 0.9004444444444445,
+      "grad_norm": 0.33969452465633604,
+      "learning_rate": 5.15383902810671e-06,
+      "loss": 0.5313,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9006222222222222,
+      "grad_norm": 0.36288858740325197,
+      "learning_rate": 5.1356079459711655e-06,
+      "loss": 0.5517,
+      "step": 5066
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.3445956683850495,
+      "learning_rate": 5.1174083162680465e-06,
+      "loss": 0.5494,
+      "step": 5067
+    },
+    {
+      "epoch": 0.9009777777777778,
+      "grad_norm": 0.3400050859421058,
+      "learning_rate": 5.0992401450314584e-06,
+      "loss": 0.5579,
+      "step": 5068
+    },
+    {
+      "epoch": 0.9011555555555556,
+      "grad_norm": 0.34723176501871816,
+      "learning_rate": 5.0811034382850955e-06,
+      "loss": 0.5809,
+      "step": 5069
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.36678676466026955,
+      "learning_rate": 5.062998202042213e-06,
+      "loss": 0.575,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9015111111111112,
+      "grad_norm": 0.3548915445767976,
+      "learning_rate": 5.044924442305621e-06,
+      "loss": 0.5429,
+      "step": 5071
+    },
+    {
+      "epoch": 0.9016888888888889,
+      "grad_norm": 0.34639744607076345,
+      "learning_rate": 5.026882165067703e-06,
+      "loss": 0.5578,
+      "step": 5072
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.3574891053512251,
+      "learning_rate": 5.008871376310409e-06,
+      "loss": 0.527,
+      "step": 5073
+    },
+    {
+      "epoch": 0.9020444444444444,
+      "grad_norm": 0.36545262702333997,
+      "learning_rate": 4.99089208200525e-06,
+      "loss": 0.5702,
+      "step": 5074
+    },
+    {
+      "epoch": 0.9022222222222223,
+      "grad_norm": 0.34434599716911823,
+      "learning_rate": 4.972944288113268e-06,
+      "loss": 0.5271,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 2.2598719635417917,
+      "learning_rate": 4.955028000585094e-06,
+      "loss": 0.5426,
+      "step": 5076
+    },
+    {
+      "epoch": 0.9025777777777778,
+      "grad_norm": 0.358430518318424,
+      "learning_rate": 4.937143225360896e-06,
+      "loss": 0.5485,
+      "step": 5077
+    },
+    {
+      "epoch": 0.9027555555555555,
+      "grad_norm": 0.34452523243508193,
+      "learning_rate": 4.9192899683703996e-06,
+      "loss": 0.5737,
+      "step": 5078
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.3750587258150285,
+      "learning_rate": 4.901468235532902e-06,
+      "loss": 0.5624,
+      "step": 5079
+    },
+    {
+      "epoch": 0.9031111111111111,
+      "grad_norm": 0.36812721003704635,
+      "learning_rate": 4.8836780327571664e-06,
+      "loss": 0.5759,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9032888888888889,
+      "grad_norm": 0.3732436350062708,
+      "learning_rate": 4.865919365941629e-06,
+      "loss": 0.5814,
+      "step": 5081
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.35324019305978754,
+      "learning_rate": 4.8481922409741474e-06,
+      "loss": 0.5413,
+      "step": 5082
+    },
+    {
+      "epoch": 0.9036444444444445,
+      "grad_norm": 0.3618793458163752,
+      "learning_rate": 4.830496663732231e-06,
+      "loss": 0.5412,
+      "step": 5083
+    },
+    {
+      "epoch": 0.9038222222222222,
+      "grad_norm": 0.37846797630893925,
+      "learning_rate": 4.812832640082809e-06,
+      "loss": 0.5489,
+      "step": 5084
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.34356591369838496,
+      "learning_rate": 4.795200175882486e-06,
+      "loss": 0.518,
+      "step": 5085
+    },
+    {
+      "epoch": 0.9041777777777777,
+      "grad_norm": 0.3508628832012701,
+      "learning_rate": 4.777599276977263e-06,
+      "loss": 0.5753,
+      "step": 5086
+    },
+    {
+      "epoch": 0.9043555555555556,
+      "grad_norm": 0.3452490825715901,
+      "learning_rate": 4.7600299492028155e-06,
+      "loss": 0.5501,
+      "step": 5087
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.36972176015346464,
+      "learning_rate": 4.74249219838423e-06,
+      "loss": 0.5615,
+      "step": 5088
+    },
+    {
+      "epoch": 0.9047111111111111,
+      "grad_norm": 0.3941148509553186,
+      "learning_rate": 4.7249860303361755e-06,
+      "loss": 0.6022,
+      "step": 5089
+    },
+    {
+      "epoch": 0.9048888888888889,
+      "grad_norm": 0.3220473714961397,
+      "learning_rate": 4.7075114508628785e-06,
+      "loss": 0.5456,
+      "step": 5090
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.34631849756345634,
+      "learning_rate": 4.690068465758035e-06,
+      "loss": 0.5409,
+      "step": 5091
+    },
+    {
+      "epoch": 0.9052444444444444,
+      "grad_norm": 0.3571419750704605,
+      "learning_rate": 4.6726570808049095e-06,
+      "loss": 0.5349,
+      "step": 5092
+    },
+    {
+      "epoch": 0.9054222222222222,
+      "grad_norm": 0.377215705994575,
+      "learning_rate": 4.6552773017762615e-06,
+      "loss": 0.5701,
+      "step": 5093
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.33026636751756705,
+      "learning_rate": 4.637929134434393e-06,
+      "loss": 0.5185,
+      "step": 5094
+    },
+    {
+      "epoch": 0.9057777777777778,
+      "grad_norm": 0.34404645691890035,
+      "learning_rate": 4.620612584531103e-06,
+      "loss": 0.4857,
+      "step": 5095
+    },
+    {
+      "epoch": 0.9059555555555555,
+      "grad_norm": 0.3636931261712703,
+      "learning_rate": 4.603327657807733e-06,
+      "loss": 0.5379,
+      "step": 5096
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.43522914914875505,
+      "learning_rate": 4.586074359995119e-06,
+      "loss": 0.5644,
+      "step": 5097
+    },
+    {
+      "epoch": 0.9063111111111111,
+      "grad_norm": 0.33286835169108125,
+      "learning_rate": 4.568852696813619e-06,
+      "loss": 0.525,
+      "step": 5098
+    },
+    {
+      "epoch": 0.9064888888888889,
+      "grad_norm": 0.35748639930805165,
+      "learning_rate": 4.551662673973101e-06,
+      "loss": 0.5726,
+      "step": 5099
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.3639371884789815,
+      "learning_rate": 4.534504297172948e-06,
+      "loss": 0.595,
+      "step": 5100
+    },
+    {
+      "epoch": 0.9068444444444445,
+      "grad_norm": 0.3693780307815125,
+      "learning_rate": 4.517377572102044e-06,
+      "loss": 0.6004,
+      "step": 5101
+    },
+    {
+      "epoch": 0.9070222222222222,
+      "grad_norm": 0.47298505997685597,
+      "learning_rate": 4.500282504438769e-06,
+      "loss": 0.5805,
+      "step": 5102
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3657160369621279,
+      "learning_rate": 4.483219099851044e-06,
+      "loss": 0.6041,
+      "step": 5103
+    },
+    {
+      "epoch": 0.9073777777777777,
+      "grad_norm": 0.3452471411735382,
+      "learning_rate": 4.466187363996232e-06,
+      "loss": 0.5203,
+      "step": 5104
+    },
+    {
+      "epoch": 0.9075555555555556,
+      "grad_norm": 0.35058032752439017,
+      "learning_rate": 4.449187302521263e-06,
+      "loss": 0.5357,
+      "step": 5105
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.3677682480853437,
+      "learning_rate": 4.4322189210625034e-06,
+      "loss": 0.5266,
+      "step": 5106
+    },
+    {
+      "epoch": 0.9079111111111111,
+      "grad_norm": 0.3302307961448783,
+      "learning_rate": 4.415282225245887e-06,
+      "loss": 0.5327,
+      "step": 5107
+    },
+    {
+      "epoch": 0.9080888888888888,
+      "grad_norm": 0.36411783954271787,
+      "learning_rate": 4.398377220686745e-06,
+      "loss": 0.5944,
+      "step": 5108
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.3589049128132145,
+      "learning_rate": 4.381503912990015e-06,
+      "loss": 0.5704,
+      "step": 5109
+    },
+    {
+      "epoch": 0.9084444444444445,
+      "grad_norm": 0.3597884333599353,
+      "learning_rate": 4.364662307750012e-06,
+      "loss": 0.5703,
+      "step": 5110
+    },
+    {
+      "epoch": 0.9086222222222222,
+      "grad_norm": 0.35557627346990905,
+      "learning_rate": 4.347852410550645e-06,
+      "loss": 0.5376,
+      "step": 5111
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.35112168032411384,
+      "learning_rate": 4.331074226965226e-06,
+      "loss": 0.5384,
+      "step": 5112
+    },
+    {
+      "epoch": 0.9089777777777778,
+      "grad_norm": 0.36962589897003206,
+      "learning_rate": 4.314327762556624e-06,
+      "loss": 0.5645,
+      "step": 5113
+    },
+    {
+      "epoch": 0.9091555555555556,
+      "grad_norm": 0.3829102262889725,
+      "learning_rate": 4.297613022877111e-06,
+      "loss": 0.5597,
+      "step": 5114
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.3717325359685093,
+      "learning_rate": 4.2809300134685095e-06,
+      "loss": 0.5557,
+      "step": 5115
+    },
+    {
+      "epoch": 0.9095111111111112,
+      "grad_norm": 0.35710985862670386,
+      "learning_rate": 4.264278739862093e-06,
+      "loss": 0.5419,
+      "step": 5116
+    },
+    {
+      "epoch": 0.9096888888888889,
+      "grad_norm": 0.346335970137347,
+      "learning_rate": 4.247659207578614e-06,
+      "loss": 0.5112,
+      "step": 5117
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.3609406995265904,
+      "learning_rate": 4.231071422128308e-06,
+      "loss": 0.5641,
+      "step": 5118
+    },
+    {
+      "epoch": 0.9100444444444444,
+      "grad_norm": 0.35658533460421904,
+      "learning_rate": 4.214515389010865e-06,
+      "loss": 0.5114,
+      "step": 5119
+    },
+    {
+      "epoch": 0.9102222222222223,
+      "grad_norm": 0.3283001955079264,
+      "learning_rate": 4.1979911137154825e-06,
+      "loss": 0.5299,
+      "step": 5120
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.3793467949510737,
+      "learning_rate": 4.181498601720801e-06,
+      "loss": 0.5586,
+      "step": 5121
+    },
+    {
+      "epoch": 0.9105777777777778,
+      "grad_norm": 0.36781059822887613,
+      "learning_rate": 4.165037858494936e-06,
+      "loss": 0.5896,
+      "step": 5122
+    },
+    {
+      "epoch": 0.9107555555555555,
+      "grad_norm": 0.362023923353087,
+      "learning_rate": 4.148608889495475e-06,
+      "loss": 0.5894,
+      "step": 5123
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.350305332968479,
+      "learning_rate": 4.132211700169464e-06,
+      "loss": 0.5467,
+      "step": 5124
+    },
+    {
+      "epoch": 0.9111111111111111,
+      "grad_norm": 0.36825412032578164,
+      "learning_rate": 4.115846295953418e-06,
+      "loss": 0.5354,
+      "step": 5125
+    },
+    {
+      "epoch": 0.9112888888888889,
+      "grad_norm": 0.3579120497706332,
+      "learning_rate": 4.09951268227331e-06,
+      "loss": 0.5035,
+      "step": 5126
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.39524139650094203,
+      "learning_rate": 4.083210864544584e-06,
+      "loss": 0.5938,
+      "step": 5127
+    },
+    {
+      "epoch": 0.9116444444444445,
+      "grad_norm": 0.35276759412639686,
+      "learning_rate": 4.066940848172107e-06,
+      "loss": 0.5365,
+      "step": 5128
+    },
+    {
+      "epoch": 0.9118222222222222,
+      "grad_norm": 0.37276027967182973,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.5737,
+      "step": 5129
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3726299724967845,
+      "learning_rate": 4.034496241062824e-06,
+      "loss": 0.5705,
+      "step": 5130
+    },
+    {
+      "epoch": 0.9121777777777778,
+      "grad_norm": 0.38400336089017784,
+      "learning_rate": 4.01832166108308e-06,
+      "loss": 0.6242,
+      "step": 5131
+    },
+    {
+      "epoch": 0.9123555555555556,
+      "grad_norm": 0.36829262590511874,
+      "learning_rate": 4.002178903973674e-06,
+      "loss": 0.5993,
+      "step": 5132
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.36182386564401386,
+      "learning_rate": 3.986067975086838e-06,
+      "loss": 0.5526,
+      "step": 5133
+    },
+    {
+      "epoch": 0.9127111111111111,
+      "grad_norm": 0.37574686746649544,
+      "learning_rate": 3.9699888797641195e-06,
+      "loss": 0.5306,
+      "step": 5134
+    },
+    {
+      "epoch": 0.9128888888888889,
+      "grad_norm": 0.36029371274709715,
+      "learning_rate": 3.95394162333661e-06,
+      "loss": 0.5519,
+      "step": 5135
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3493479019354734,
+      "learning_rate": 3.937926211124743e-06,
+      "loss": 0.5238,
+      "step": 5136
+    },
+    {
+      "epoch": 0.9132444444444444,
+      "grad_norm": 0.35914805914076947,
+      "learning_rate": 3.921942648438526e-06,
+      "loss": 0.5664,
+      "step": 5137
+    },
+    {
+      "epoch": 0.9134222222222222,
+      "grad_norm": 0.35601899050295577,
+      "learning_rate": 3.905990940577275e-06,
+      "loss": 0.5546,
+      "step": 5138
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3422747728252129,
+      "learning_rate": 3.890071092829828e-06,
+      "loss": 0.5492,
+      "step": 5139
+    },
+    {
+      "epoch": 0.9137777777777778,
+      "grad_norm": 0.3520173603898526,
+      "learning_rate": 3.8741831104744274e-06,
+      "loss": 0.5019,
+      "step": 5140
+    },
+    {
+      "epoch": 0.9139555555555555,
+      "grad_norm": 0.3461354557766037,
+      "learning_rate": 3.858326998778761e-06,
+      "loss": 0.5447,
+      "step": 5141
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.4429906638018704,
+      "learning_rate": 3.842502762999944e-06,
+      "loss": 0.5964,
+      "step": 5142
+    },
+    {
+      "epoch": 0.9143111111111111,
+      "grad_norm": 0.519003317711788,
+      "learning_rate": 3.8267104083845265e-06,
+      "loss": 0.5187,
+      "step": 5143
+    },
+    {
+      "epoch": 0.9144888888888889,
+      "grad_norm": 0.48031557176649636,
+      "learning_rate": 3.8109499401684847e-06,
+      "loss": 0.5771,
+      "step": 5144
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.48631790121650526,
+      "learning_rate": 3.795221363577239e-06,
+      "loss": 0.5368,
+      "step": 5145
+    },
+    {
+      "epoch": 0.9148444444444445,
+      "grad_norm": 0.39271895500917803,
+      "learning_rate": 3.7795246838256084e-06,
+      "loss": 0.5386,
+      "step": 5146
+    },
+    {
+      "epoch": 0.9150222222222222,
+      "grad_norm": 0.34028015067040746,
+      "learning_rate": 3.7638599061178504e-06,
+      "loss": 0.5218,
+      "step": 5147
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3507908403602419,
+      "learning_rate": 3.7482270356476557e-06,
+      "loss": 0.5986,
+      "step": 5148
+    },
+    {
+      "epoch": 0.9153777777777777,
+      "grad_norm": 0.35320509302855646,
+      "learning_rate": 3.7326260775981227e-06,
+      "loss": 0.5719,
+      "step": 5149
+    },
+    {
+      "epoch": 0.9155555555555556,
+      "grad_norm": 0.35151463145916706,
+      "learning_rate": 3.717057037141769e-06,
+      "loss": 0.543,
+      "step": 5150
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.3485543149978255,
+      "learning_rate": 3.7015199194405325e-06,
+      "loss": 0.5789,
+      "step": 5151
+    },
+    {
+      "epoch": 0.9159111111111111,
+      "grad_norm": 0.3681799012171259,
+      "learning_rate": 3.6860147296457816e-06,
+      "loss": 0.5334,
+      "step": 5152
+    },
+    {
+      "epoch": 0.9160888888888888,
+      "grad_norm": 0.3614557773301528,
+      "learning_rate": 3.67054147289827e-06,
+      "loss": 0.5648,
+      "step": 5153
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.3692548576572285,
+      "learning_rate": 3.6551001543281726e-06,
+      "loss": 0.5419,
+      "step": 5154
+    },
+    {
+      "epoch": 0.9164444444444444,
+      "grad_norm": 0.34956135439933617,
+      "learning_rate": 3.639690779055116e-06,
+      "loss": 0.5324,
+      "step": 5155
+    },
+    {
+      "epoch": 0.9166222222222222,
+      "grad_norm": 0.3452128245231248,
+      "learning_rate": 3.6243133521880577e-06,
+      "loss": 0.5241,
+      "step": 5156
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.3590939412493103,
+      "learning_rate": 3.6089678788254423e-06,
+      "loss": 0.5267,
+      "step": 5157
+    },
+    {
+      "epoch": 0.9169777777777778,
+      "grad_norm": 0.37138443221436024,
+      "learning_rate": 3.5936543640550547e-06,
+      "loss": 0.6133,
+      "step": 5158
+    },
+    {
+      "epoch": 0.9171555555555555,
+      "grad_norm": 0.331071153420671,
+      "learning_rate": 3.578372812954156e-06,
+      "loss": 0.5803,
+      "step": 5159
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.3542555192315465,
+      "learning_rate": 3.5631232305893046e-06,
+      "loss": 0.5239,
+      "step": 5160
+    },
+    {
+      "epoch": 0.9175111111111112,
+      "grad_norm": 0.3378798234336313,
+      "learning_rate": 3.547905622016601e-06,
+      "loss": 0.572,
+      "step": 5161
+    },
+    {
+      "epoch": 0.9176888888888889,
+      "grad_norm": 0.37154754473273177,
+      "learning_rate": 3.532719992281397e-06,
+      "loss": 0.5873,
+      "step": 5162
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.36367586620061826,
+      "learning_rate": 3.5175663464185436e-06,
+      "loss": 0.5447,
+      "step": 5163
+    },
+    {
+      "epoch": 0.9180444444444444,
+      "grad_norm": 0.3480227928922605,
+      "learning_rate": 3.5024446894522554e-06,
+      "loss": 0.5663,
+      "step": 5164
+    },
+    {
+      "epoch": 0.9182222222222223,
+      "grad_norm": 0.3470123907410669,
+      "learning_rate": 3.487355026396133e-06,
+      "loss": 0.5605,
+      "step": 5165
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.34286031286193924,
+      "learning_rate": 3.472297362253174e-06,
+      "loss": 0.582,
+      "step": 5166
+    },
+    {
+      "epoch": 0.9185777777777778,
+      "grad_norm": 0.3764571533169989,
+      "learning_rate": 3.4572717020157853e-06,
+      "loss": 0.5645,
+      "step": 5167
+    },
+    {
+      "epoch": 0.9187555555555555,
+      "grad_norm": 0.3541064327410212,
+      "learning_rate": 3.442278050665726e-06,
+      "loss": 0.566,
+      "step": 5168
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.3486433028154304,
+      "learning_rate": 3.4273164131741753e-06,
+      "loss": 0.5342,
+      "step": 5169
+    },
+    {
+      "epoch": 0.9191111111111111,
+      "grad_norm": 0.355460177148592,
+      "learning_rate": 3.4123867945016983e-06,
+      "loss": 0.53,
+      "step": 5170
+    },
+    {
+      "epoch": 0.9192888888888889,
+      "grad_norm": 0.33789411676419545,
+      "learning_rate": 3.3974891995982026e-06,
+      "loss": 0.5335,
+      "step": 5171
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.4733886870054763,
+      "learning_rate": 3.382623633403037e-06,
+      "loss": 0.5522,
+      "step": 5172
+    },
+    {
+      "epoch": 0.9196444444444445,
+      "grad_norm": 0.391565506036996,
+      "learning_rate": 3.367790100844892e-06,
+      "loss": 0.5526,
+      "step": 5173
+    },
+    {
+      "epoch": 0.9198222222222222,
+      "grad_norm": 0.35650829502658204,
+      "learning_rate": 3.3529886068418447e-06,
+      "loss": 0.5475,
+      "step": 5174
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3475378732928639,
+      "learning_rate": 3.3382191563013588e-06,
+      "loss": 0.5895,
+      "step": 5175
+    },
+    {
+      "epoch": 0.9201777777777778,
+      "grad_norm": 0.35491100687567145,
+      "learning_rate": 3.323481754120261e-06,
+      "loss": 0.5893,
+      "step": 5176
+    },
+    {
+      "epoch": 0.9203555555555556,
+      "grad_norm": 0.3355598005643602,
+      "learning_rate": 3.308776405184777e-06,
+      "loss": 0.5385,
+      "step": 5177
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.34152425953551774,
+      "learning_rate": 3.2941031143704503e-06,
+      "loss": 0.5859,
+      "step": 5178
+    },
+    {
+      "epoch": 0.9207111111111111,
+      "grad_norm": 0.35613888434743146,
+      "learning_rate": 3.2794618865422677e-06,
+      "loss": 0.556,
+      "step": 5179
+    },
+    {
+      "epoch": 0.9208888888888889,
+      "grad_norm": 0.35494124210847294,
+      "learning_rate": 3.264852726554535e-06,
+      "loss": 0.5477,
+      "step": 5180
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.3571771768448111,
+      "learning_rate": 3.250275639250955e-06,
+      "loss": 0.5715,
+      "step": 5181
+    },
+    {
+      "epoch": 0.9212444444444444,
+      "grad_norm": 0.3601177771043013,
+      "learning_rate": 3.235730629464551e-06,
+      "loss": 0.5803,
+      "step": 5182
+    },
+    {
+      "epoch": 0.9214222222222223,
+      "grad_norm": 0.37321635044932994,
+      "learning_rate": 3.221217702017787e-06,
+      "loss": 0.5376,
+      "step": 5183
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3619891260147188,
+      "learning_rate": 3.2067368617223924e-06,
+      "loss": 0.559,
+      "step": 5184
+    },
+    {
+      "epoch": 0.9217777777777778,
+      "grad_norm": 0.37928917539145074,
+      "learning_rate": 3.1922881133795825e-06,
+      "loss": 0.5685,
+      "step": 5185
+    },
+    {
+      "epoch": 0.9219555555555555,
+      "grad_norm": 0.380841204242253,
+      "learning_rate": 3.177871461779791e-06,
+      "loss": 0.5712,
+      "step": 5186
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.3635877216519205,
+      "learning_rate": 3.163486911702929e-06,
+      "loss": 0.5689,
+      "step": 5187
+    },
+    {
+      "epoch": 0.9223111111111111,
+      "grad_norm": 0.4138488951733727,
+      "learning_rate": 3.149134467918191e-06,
+      "loss": 0.5978,
+      "step": 5188
+    },
+    {
+      "epoch": 0.9224888888888889,
+      "grad_norm": 0.34140945563192293,
+      "learning_rate": 3.134814135184161e-06,
+      "loss": 0.5212,
+      "step": 5189
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.34702298805572057,
+      "learning_rate": 3.1205259182487624e-06,
+      "loss": 0.5633,
+      "step": 5190
+    },
+    {
+      "epoch": 0.9228444444444445,
+      "grad_norm": 0.39947010219578805,
+      "learning_rate": 3.1062698218492724e-06,
+      "loss": 0.592,
+      "step": 5191
+    },
+    {
+      "epoch": 0.9230222222222222,
+      "grad_norm": 0.3318201473187928,
+      "learning_rate": 3.092045850712333e-06,
+      "loss": 0.4932,
+      "step": 5192
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.35191289602622544,
+      "learning_rate": 3.0778540095539156e-06,
+      "loss": 0.5549,
+      "step": 5193
+    },
+    {
+      "epoch": 0.9233777777777777,
+      "grad_norm": 0.3311251479849193,
+      "learning_rate": 3.063694303079345e-06,
+      "loss": 0.5548,
+      "step": 5194
+    },
+    {
+      "epoch": 0.9235555555555556,
+      "grad_norm": 0.3879932909834961,
+      "learning_rate": 3.049566735983289e-06,
+      "loss": 0.5521,
+      "step": 5195
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.43230928961559584,
+      "learning_rate": 3.035471312949778e-06,
+      "loss": 0.5311,
+      "step": 5196
+    },
+    {
+      "epoch": 0.9239111111111111,
+      "grad_norm": 0.3792734534789123,
+      "learning_rate": 3.0214080386521626e-06,
+      "loss": 0.5728,
+      "step": 5197
+    },
+    {
+      "epoch": 0.9240888888888888,
+      "grad_norm": 0.35889086322788477,
+      "learning_rate": 3.0073769177531463e-06,
+      "loss": 0.522,
+      "step": 5198
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.35540181518873853,
+      "learning_rate": 2.9933779549047636e-06,
+      "loss": 0.5185,
+      "step": 5199
+    },
+    {
+      "epoch": 0.9244444444444444,
+      "grad_norm": 0.3882251010141909,
+      "learning_rate": 2.9794111547483907e-06,
+      "loss": 0.6139,
+      "step": 5200
+    },
+    {
+      "epoch": 0.9246222222222222,
+      "grad_norm": 0.3745772284413269,
+      "learning_rate": 2.9654765219147563e-06,
+      "loss": 0.5977,
+      "step": 5201
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3320636102677679,
+      "learning_rate": 2.9515740610238762e-06,
+      "loss": 0.5443,
+      "step": 5202
+    },
+    {
+      "epoch": 0.9249777777777778,
+      "grad_norm": 0.35092225519399317,
+      "learning_rate": 2.9377037766851747e-06,
+      "loss": 0.5364,
+      "step": 5203
+    },
+    {
+      "epoch": 0.9251555555555555,
+      "grad_norm": 0.7484364376340257,
+      "learning_rate": 2.9238656734973167e-06,
+      "loss": 0.5593,
+      "step": 5204
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.3602163416545684,
+      "learning_rate": 2.9100597560484e-06,
+      "loss": 0.5851,
+      "step": 5205
+    },
+    {
+      "epoch": 0.9255111111111111,
+      "grad_norm": 0.3500410601160329,
+      "learning_rate": 2.8962860289157513e-06,
+      "loss": 0.611,
+      "step": 5206
+    },
+    {
+      "epoch": 0.9256888888888889,
+      "grad_norm": 0.35074020947725,
+      "learning_rate": 2.8825444966661063e-06,
+      "loss": 0.5428,
+      "step": 5207
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.3735741411277997,
+      "learning_rate": 2.8688351638554543e-06,
+      "loss": 0.5177,
+      "step": 5208
+    },
+    {
+      "epoch": 0.9260444444444444,
+      "grad_norm": 0.35723305854311777,
+      "learning_rate": 2.8551580350291817e-06,
+      "loss": 0.5651,
+      "step": 5209
+    },
+    {
+      "epoch": 0.9262222222222222,
+      "grad_norm": 0.3559805056580754,
+      "learning_rate": 2.8415131147219276e-06,
+      "loss": 0.5723,
+      "step": 5210
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3913503257383937,
+      "learning_rate": 2.8279004074577285e-06,
+      "loss": 0.5577,
+      "step": 5211
+    },
+    {
+      "epoch": 0.9265777777777777,
+      "grad_norm": 0.37114870341059303,
+      "learning_rate": 2.8143199177498525e-06,
+      "loss": 0.5638,
+      "step": 5212
+    },
+    {
+      "epoch": 0.9267555555555556,
+      "grad_norm": 0.3584507808312592,
+      "learning_rate": 2.800771650100964e-06,
+      "loss": 0.5291,
+      "step": 5213
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.423400141656338,
+      "learning_rate": 2.7872556090029923e-06,
+      "loss": 0.5934,
+      "step": 5214
+    },
+    {
+      "epoch": 0.9271111111111111,
+      "grad_norm": 0.35068976878422503,
+      "learning_rate": 2.773771798937208e-06,
+      "loss": 0.5833,
+      "step": 5215
+    },
+    {
+      "epoch": 0.9272888888888889,
+      "grad_norm": 0.3465886756047563,
+      "learning_rate": 2.760320224374191e-06,
+      "loss": 0.5708,
+      "step": 5216
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.3448903687595693,
+      "learning_rate": 2.746900889773829e-06,
+      "loss": 0.5809,
+      "step": 5217
+    },
+    {
+      "epoch": 0.9276444444444445,
+      "grad_norm": 0.34518172097961836,
+      "learning_rate": 2.7335137995853188e-06,
+      "loss": 0.5468,
+      "step": 5218
+    },
+    {
+      "epoch": 0.9278222222222222,
+      "grad_norm": 0.4172719503894572,
+      "learning_rate": 2.7201589582471763e-06,
+      "loss": 0.5447,
+      "step": 5219
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.373077227511944,
+      "learning_rate": 2.7068363701872155e-06,
+      "loss": 0.5521,
+      "step": 5220
+    },
+    {
+      "epoch": 0.9281777777777778,
+      "grad_norm": 0.35812578476529233,
+      "learning_rate": 2.6935460398225697e-06,
+      "loss": 0.5521,
+      "step": 5221
+    },
+    {
+      "epoch": 0.9283555555555556,
+      "grad_norm": 0.41293663098170164,
+      "learning_rate": 2.6802879715596585e-06,
+      "loss": 0.5862,
+      "step": 5222
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.3697987938093545,
+      "learning_rate": 2.66706216979421e-06,
+      "loss": 0.5632,
+      "step": 5223
+    },
+    {
+      "epoch": 0.9287111111111112,
+      "grad_norm": 0.36477036202064156,
+      "learning_rate": 2.653868638911272e-06,
+      "loss": 0.5087,
+      "step": 5224
+    },
+    {
+      "epoch": 0.9288888888888889,
+      "grad_norm": 0.3717172288092181,
+      "learning_rate": 2.6407073832851682e-06,
+      "loss": 0.5755,
+      "step": 5225
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.3313904033496613,
+      "learning_rate": 2.6275784072795405e-06,
+      "loss": 0.5177,
+      "step": 5226
+    },
+    {
+      "epoch": 0.9292444444444444,
+      "grad_norm": 0.3541283506194142,
+      "learning_rate": 2.6144817152473298e-06,
+      "loss": 0.592,
+      "step": 5227
+    },
+    {
+      "epoch": 0.9294222222222223,
+      "grad_norm": 0.363546268306384,
+      "learning_rate": 2.6014173115307292e-06,
+      "loss": 0.5625,
+      "step": 5228
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3441569867708462,
+      "learning_rate": 2.5883852004613074e-06,
+      "loss": 0.5288,
+      "step": 5229
+    },
+    {
+      "epoch": 0.9297777777777778,
+      "grad_norm": 0.3632864590675993,
+      "learning_rate": 2.57538538635983e-06,
+      "loss": 0.5287,
+      "step": 5230
+    },
+    {
+      "epoch": 0.9299555555555555,
+      "grad_norm": 0.3343882034342482,
+      "learning_rate": 2.56241787353646e-06,
+      "loss": 0.5204,
+      "step": 5231
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.35333098925624495,
+      "learning_rate": 2.549482666290537e-06,
+      "loss": 0.5661,
+      "step": 5232
+    },
+    {
+      "epoch": 0.9303111111111111,
+      "grad_norm": 0.35706572963037697,
+      "learning_rate": 2.536579768910818e-06,
+      "loss": 0.5253,
+      "step": 5233
+    },
+    {
+      "epoch": 0.9304888888888889,
+      "grad_norm": 0.3407031331535539,
+      "learning_rate": 2.523709185675205e-06,
+      "loss": 0.5645,
+      "step": 5234
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.3313501853633208,
+      "learning_rate": 2.510870920851016e-06,
+      "loss": 0.5886,
+      "step": 5235
+    },
+    {
+      "epoch": 0.9308444444444445,
+      "grad_norm": 0.3518301772257647,
+      "learning_rate": 2.4980649786947695e-06,
+      "loss": 0.505,
+      "step": 5236
+    },
+    {
+      "epoch": 0.9310222222222222,
+      "grad_norm": 0.33392710606804554,
+      "learning_rate": 2.4852913634523023e-06,
+      "loss": 0.5628,
+      "step": 5237
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3273304965567965,
+      "learning_rate": 2.472550079358715e-06,
+      "loss": 0.5505,
+      "step": 5238
+    },
+    {
+      "epoch": 0.9313777777777777,
+      "grad_norm": 0.37751051695255455,
+      "learning_rate": 2.4598411306384185e-06,
+      "loss": 0.6323,
+      "step": 5239
+    },
+    {
+      "epoch": 0.9315555555555556,
+      "grad_norm": 0.3765049607333993,
+      "learning_rate": 2.4471645215050743e-06,
+      "loss": 0.5968,
+      "step": 5240
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.32908875140069194,
+      "learning_rate": 2.434520256161632e-06,
+      "loss": 0.534,
+      "step": 5241
+    },
+    {
+      "epoch": 0.9319111111111111,
+      "grad_norm": 0.36414118701144976,
+      "learning_rate": 2.421908338800305e-06,
+      "loss": 0.608,
+      "step": 5242
+    },
+    {
+      "epoch": 0.9320888888888889,
+      "grad_norm": 0.3767999681296671,
+      "learning_rate": 2.409328773602615e-06,
+      "loss": 0.5478,
+      "step": 5243
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.35039822210530885,
+      "learning_rate": 2.3967815647393256e-06,
+      "loss": 0.5453,
+      "step": 5244
+    },
+    {
+      "epoch": 0.9324444444444444,
+      "grad_norm": 0.37747047804756695,
+      "learning_rate": 2.384266716370476e-06,
+      "loss": 0.5697,
+      "step": 5245
+    },
+    {
+      "epoch": 0.9326222222222222,
+      "grad_norm": 0.35569079514058394,
+      "learning_rate": 2.371784232645391e-06,
+      "loss": 0.595,
+      "step": 5246
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3634310013100204,
+      "learning_rate": 2.359334117702661e-06,
+      "loss": 0.5834,
+      "step": 5247
+    },
+    {
+      "epoch": 0.9329777777777778,
+      "grad_norm": 0.3526973892567933,
+      "learning_rate": 2.3469163756701273e-06,
+      "loss": 0.5702,
+      "step": 5248
+    },
+    {
+      "epoch": 0.9331555555555555,
+      "grad_norm": 0.35812201643996516,
+      "learning_rate": 2.334531010664931e-06,
+      "loss": 0.5904,
+      "step": 5249
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.35401127783065856,
+      "learning_rate": 2.322178026793431e-06,
+      "loss": 0.5568,
+      "step": 5250
+    },
+    {
+      "epoch": 0.9335111111111111,
+      "grad_norm": 0.7042918927917877,
+      "learning_rate": 2.3098574281513185e-06,
+      "loss": 0.5789,
+      "step": 5251
+    },
+    {
+      "epoch": 0.9336888888888889,
+      "grad_norm": 0.3892363301460388,
+      "learning_rate": 2.2975692188234475e-06,
+      "loss": 0.5717,
+      "step": 5252
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.3718016748977139,
+      "learning_rate": 2.2853134028840594e-06,
+      "loss": 0.5424,
+      "step": 5253
+    },
+    {
+      "epoch": 0.9340444444444445,
+      "grad_norm": 0.33675808435914883,
+      "learning_rate": 2.2730899843965257e-06,
+      "loss": 0.5762,
+      "step": 5254
+    },
+    {
+      "epoch": 0.9342222222222222,
+      "grad_norm": 0.35475447430044343,
+      "learning_rate": 2.260898967413594e-06,
+      "loss": 0.5699,
+      "step": 5255
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3303972476591452,
+      "learning_rate": 2.2487403559771636e-06,
+      "loss": 0.5031,
+      "step": 5256
+    },
+    {
+      "epoch": 0.9345777777777777,
+      "grad_norm": 0.3774733281549121,
+      "learning_rate": 2.2366141541184883e-06,
+      "loss": 0.5487,
+      "step": 5257
+    },
+    {
+      "epoch": 0.9347555555555556,
+      "grad_norm": 0.3441550969788787,
+      "learning_rate": 2.2245203658579962e-06,
+      "loss": 0.5457,
+      "step": 5258
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.35148168347118697,
+      "learning_rate": 2.212458995205413e-06,
+      "loss": 0.5394,
+      "step": 5259
+    },
+    {
+      "epoch": 0.9351111111111111,
+      "grad_norm": 0.3492774525260916,
+      "learning_rate": 2.2004300461597073e-06,
+      "loss": 0.5505,
+      "step": 5260
+    },
+    {
+      "epoch": 0.9352888888888888,
+      "grad_norm": 0.36128489264301855,
+      "learning_rate": 2.188433522709088e-06,
+      "loss": 0.5588,
+      "step": 5261
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.32867752822948326,
+      "learning_rate": 2.1764694288310184e-06,
+      "loss": 0.5477,
+      "step": 5262
+    },
+    {
+      "epoch": 0.9356444444444444,
+      "grad_norm": 0.36191190158101877,
+      "learning_rate": 2.1645377684922252e-06,
+      "loss": 0.5624,
+      "step": 5263
+    },
+    {
+      "epoch": 0.9358222222222222,
+      "grad_norm": 0.5240451145392441,
+      "learning_rate": 2.152638545648644e-06,
+      "loss": 0.5367,
+      "step": 5264
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3240433096840552,
+      "learning_rate": 2.1407717642455082e-06,
+      "loss": 0.5262,
+      "step": 5265
+    },
+    {
+      "epoch": 0.9361777777777778,
+      "grad_norm": 0.3567531938105747,
+      "learning_rate": 2.128937428217259e-06,
+      "loss": 0.5374,
+      "step": 5266
+    },
+    {
+      "epoch": 0.9363555555555556,
+      "grad_norm": 0.35008253187215066,
+      "learning_rate": 2.117135541487569e-06,
+      "loss": 0.5425,
+      "step": 5267
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.3724918942605047,
+      "learning_rate": 2.1053661079693976e-06,
+      "loss": 0.5498,
+      "step": 5268
+    },
+    {
+      "epoch": 0.9367111111111112,
+      "grad_norm": 0.3488246745219126,
+      "learning_rate": 2.0936291315649113e-06,
+      "loss": 0.5358,
+      "step": 5269
+    },
+    {
+      "epoch": 0.9368888888888889,
+      "grad_norm": 0.36840778758020337,
+      "learning_rate": 2.0819246161655092e-06,
+      "loss": 0.5752,
+      "step": 5270
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.35457885943501455,
+      "learning_rate": 2.0702525656518534e-06,
+      "loss": 0.5374,
+      "step": 5271
+    },
+    {
+      "epoch": 0.9372444444444444,
+      "grad_norm": 0.3888329421261131,
+      "learning_rate": 2.0586129838938263e-06,
+      "loss": 0.5521,
+      "step": 5272
+    },
+    {
+      "epoch": 0.9374222222222223,
+      "grad_norm": 0.32556892946070465,
+      "learning_rate": 2.0470058747505516e-06,
+      "loss": 0.5056,
+      "step": 5273
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.37746947301796263,
+      "learning_rate": 2.0354312420703847e-06,
+      "loss": 0.5492,
+      "step": 5274
+    },
+    {
+      "epoch": 0.9377777777777778,
+      "grad_norm": 0.35591614326200327,
+      "learning_rate": 2.023889089690911e-06,
+      "loss": 0.5582,
+      "step": 5275
+    },
+    {
+      "epoch": 0.9379555555555555,
+      "grad_norm": 0.3285066271862678,
+      "learning_rate": 2.012379421438937e-06,
+      "loss": 0.5329,
+      "step": 5276
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.3605098620380117,
+      "learning_rate": 2.0009022411305313e-06,
+      "loss": 0.5952,
+      "step": 5277
+    },
+    {
+      "epoch": 0.9383111111111111,
+      "grad_norm": 0.3747214465320203,
+      "learning_rate": 1.989457552570939e-06,
+      "loss": 0.5555,
+      "step": 5278
+    },
+    {
+      "epoch": 0.9384888888888889,
+      "grad_norm": 0.35790797694265153,
+      "learning_rate": 1.9780453595547145e-06,
+      "loss": 0.5608,
+      "step": 5279
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.4175122800171325,
+      "learning_rate": 1.96666566586553e-06,
+      "loss": 0.5148,
+      "step": 5280
+    },
+    {
+      "epoch": 0.9388444444444445,
+      "grad_norm": 0.34609462319658957,
+      "learning_rate": 1.955318475276391e-06,
+      "loss": 0.5959,
+      "step": 5281
+    },
+    {
+      "epoch": 0.9390222222222222,
+      "grad_norm": 0.37011007137890656,
+      "learning_rate": 1.9440037915494316e-06,
+      "loss": 0.5464,
+      "step": 5282
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.36530700661180315,
+      "learning_rate": 1.9327216184360843e-06,
+      "loss": 0.593,
+      "step": 5283
+    },
+    {
+      "epoch": 0.9393777777777778,
+      "grad_norm": 0.3432745073859673,
+      "learning_rate": 1.921471959676957e-06,
+      "loss": 0.5825,
+      "step": 5284
+    },
+    {
+      "epoch": 0.9395555555555556,
+      "grad_norm": 0.37732265708241935,
+      "learning_rate": 1.9102548190018887e-06,
+      "loss": 0.5582,
+      "step": 5285
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.3440408926572364,
+      "learning_rate": 1.89907020012996e-06,
+      "loss": 0.537,
+      "step": 5286
+    },
+    {
+      "epoch": 0.9399111111111111,
+      "grad_norm": 0.39292759798214,
+      "learning_rate": 1.887918106769415e-06,
+      "loss": 0.555,
+      "step": 5287
+    },
+    {
+      "epoch": 0.9400888888888889,
+      "grad_norm": 0.35950334473642725,
+      "learning_rate": 1.8767985426177748e-06,
+      "loss": 0.5427,
+      "step": 5288
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.3440525012612628,
+      "learning_rate": 1.865711511361734e-06,
+      "loss": 0.5185,
+      "step": 5289
+    },
+    {
+      "epoch": 0.9404444444444444,
+      "grad_norm": 0.38243832953269186,
+      "learning_rate": 1.8546570166772193e-06,
+      "loss": 0.6008,
+      "step": 5290
+    },
+    {
+      "epoch": 0.9406222222222222,
+      "grad_norm": 0.3767184537860918,
+      "learning_rate": 1.843635062229354e-06,
+      "loss": 0.5689,
+      "step": 5291
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3708024428732624,
+      "learning_rate": 1.8326456516725155e-06,
+      "loss": 0.5554,
+      "step": 5292
+    },
+    {
+      "epoch": 0.9409777777777778,
+      "grad_norm": 0.35507369216051266,
+      "learning_rate": 1.821688788650211e-06,
+      "loss": 0.5109,
+      "step": 5293
+    },
+    {
+      "epoch": 0.9411555555555555,
+      "grad_norm": 0.3511085498837757,
+      "learning_rate": 1.810764476795257e-06,
+      "loss": 0.5279,
+      "step": 5294
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.35301873355061625,
+      "learning_rate": 1.7998727197295784e-06,
+      "loss": 0.5562,
+      "step": 5295
+    },
+    {
+      "epoch": 0.9415111111111111,
+      "grad_norm": 0.3770329563922565,
+      "learning_rate": 1.7890135210643865e-06,
+      "loss": 0.5723,
+      "step": 5296
+    },
+    {
+      "epoch": 0.9416888888888889,
+      "grad_norm": 0.362297747041444,
+      "learning_rate": 1.778186884400046e-06,
+      "loss": 0.5586,
+      "step": 5297
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.33828352400251355,
+      "learning_rate": 1.767392813326163e-06,
+      "loss": 0.5137,
+      "step": 5298
+    },
+    {
+      "epoch": 0.9420444444444445,
+      "grad_norm": 0.40197056613897847,
+      "learning_rate": 1.7566313114215082e-06,
+      "loss": 0.536,
+      "step": 5299
+    },
+    {
+      "epoch": 0.9422222222222222,
+      "grad_norm": 0.3434618217131165,
+      "learning_rate": 1.7459023822540943e-06,
+      "loss": 0.5895,
+      "step": 5300
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.33019712985345034,
+      "learning_rate": 1.7352060293810868e-06,
+      "loss": 0.5477,
+      "step": 5301
+    },
+    {
+      "epoch": 0.9425777777777777,
+      "grad_norm": 0.3768755077792002,
+      "learning_rate": 1.7245422563489045e-06,
+      "loss": 0.5862,
+      "step": 5302
+    },
+    {
+      "epoch": 0.9427555555555556,
+      "grad_norm": 0.3507032336343113,
+      "learning_rate": 1.7139110666931191e-06,
+      "loss": 0.5116,
+      "step": 5303
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.3614892470305919,
+      "learning_rate": 1.7033124639385333e-06,
+      "loss": 0.5617,
+      "step": 5304
+    },
+    {
+      "epoch": 0.9431111111111111,
+      "grad_norm": 0.35151746024174557,
+      "learning_rate": 1.6927464515991142e-06,
+      "loss": 0.5633,
+      "step": 5305
+    },
+    {
+      "epoch": 0.9432888888888888,
+      "grad_norm": 0.3698253321825334,
+      "learning_rate": 1.6822130331780484e-06,
+      "loss": 0.5663,
+      "step": 5306
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.3507490856884903,
+      "learning_rate": 1.6717122121677088e-06,
+      "loss": 0.5314,
+      "step": 5307
+    },
+    {
+      "epoch": 0.9436444444444444,
+      "grad_norm": 0.33157682103988173,
+      "learning_rate": 1.6612439920496548e-06,
+      "loss": 0.5134,
+      "step": 5308
+    },
+    {
+      "epoch": 0.9438222222222222,
+      "grad_norm": 0.3450966457133679,
+      "learning_rate": 1.6508083762946324e-06,
+      "loss": 0.5132,
+      "step": 5309
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.36970095565930994,
+      "learning_rate": 1.6404053683626076e-06,
+      "loss": 0.5912,
+      "step": 5310
+    },
+    {
+      "epoch": 0.9441777777777778,
+      "grad_norm": 0.38493115738719047,
+      "learning_rate": 1.6300349717026875e-06,
+      "loss": 0.5632,
+      "step": 5311
+    },
+    {
+      "epoch": 0.9443555555555555,
+      "grad_norm": 0.37204627315981625,
+      "learning_rate": 1.619697189753211e-06,
+      "loss": 0.5559,
+      "step": 5312
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.35743393176457144,
+      "learning_rate": 1.6093920259416696e-06,
+      "loss": 0.5259,
+      "step": 5313
+    },
+    {
+      "epoch": 0.9447111111111111,
+      "grad_norm": 0.3683940186535053,
+      "learning_rate": 1.5991194836847746e-06,
+      "loss": 0.5406,
+      "step": 5314
+    },
+    {
+      "epoch": 0.9448888888888889,
+      "grad_norm": 0.37002343244193114,
+      "learning_rate": 1.5888795663883904e-06,
+      "loss": 0.5309,
+      "step": 5315
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.3483337769977269,
+      "learning_rate": 1.5786722774475793e-06,
+      "loss": 0.5918,
+      "step": 5316
+    },
+    {
+      "epoch": 0.9452444444444444,
+      "grad_norm": 0.341713628604192,
+      "learning_rate": 1.5684976202465784e-06,
+      "loss": 0.526,
+      "step": 5317
+    },
+    {
+      "epoch": 0.9454222222222223,
+      "grad_norm": 0.36775993877312124,
+      "learning_rate": 1.5583555981588338e-06,
+      "loss": 0.5721,
+      "step": 5318
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.34590965148087716,
+      "learning_rate": 1.5482462145469224e-06,
+      "loss": 0.5485,
+      "step": 5319
+    },
+    {
+      "epoch": 0.9457777777777778,
+      "grad_norm": 0.35971688745675895,
+      "learning_rate": 1.5381694727626295e-06,
+      "loss": 0.5387,
+      "step": 5320
+    },
+    {
+      "epoch": 0.9459555555555555,
+      "grad_norm": 0.35607081447358163,
+      "learning_rate": 1.5281253761469161e-06,
+      "loss": 0.5649,
+      "step": 5321
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.3510700520739768,
+      "learning_rate": 1.5181139280299295e-06,
+      "loss": 0.5671,
+      "step": 5322
+    },
+    {
+      "epoch": 0.9463111111111111,
+      "grad_norm": 0.34212333227749525,
+      "learning_rate": 1.50813513173097e-06,
+      "loss": 0.5338,
+      "step": 5323
+    },
+    {
+      "epoch": 0.9464888888888889,
+      "grad_norm": 0.3619475089093974,
+      "learning_rate": 1.4981889905585134e-06,
+      "loss": 0.5446,
+      "step": 5324
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.3381827481499597,
+      "learning_rate": 1.4882755078102327e-06,
+      "loss": 0.5357,
+      "step": 5325
+    },
+    {
+      "epoch": 0.9468444444444445,
+      "grad_norm": 0.37066267828269134,
+      "learning_rate": 1.4783946867729547e-06,
+      "loss": 0.5777,
+      "step": 5326
+    },
+    {
+      "epoch": 0.9470222222222222,
+      "grad_norm": 0.3706286998566814,
+      "learning_rate": 1.468546530722681e-06,
+      "loss": 0.5785,
+      "step": 5327
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3511841440117857,
+      "learning_rate": 1.4587310429245882e-06,
+      "loss": 0.5379,
+      "step": 5328
+    },
+    {
+      "epoch": 0.9473777777777778,
+      "grad_norm": 0.34150136023459365,
+      "learning_rate": 1.4489482266329956e-06,
+      "loss": 0.5693,
+      "step": 5329
+    },
+    {
+      "epoch": 0.9475555555555556,
+      "grad_norm": 0.36761504570399756,
+      "learning_rate": 1.4391980850914311e-06,
+      "loss": 0.5668,
+      "step": 5330
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.3423601140556568,
+      "learning_rate": 1.429480621532564e-06,
+      "loss": 0.5541,
+      "step": 5331
+    },
+    {
+      "epoch": 0.9479111111111111,
+      "grad_norm": 0.35062857338708975,
+      "learning_rate": 1.4197958391782284e-06,
+      "loss": 0.5273,
+      "step": 5332
+    },
+    {
+      "epoch": 0.9480888888888889,
+      "grad_norm": 0.3438328556286129,
+      "learning_rate": 1.4101437412394336e-06,
+      "loss": 0.5905,
+      "step": 5333
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.3473827178403967,
+      "learning_rate": 1.4005243309163418e-06,
+      "loss": 0.5686,
+      "step": 5334
+    },
+    {
+      "epoch": 0.9484444444444444,
+      "grad_norm": 0.3420838734297711,
+      "learning_rate": 1.3909376113982798e-06,
+      "loss": 0.5276,
+      "step": 5335
+    },
+    {
+      "epoch": 0.9486222222222223,
+      "grad_norm": 0.35300484746503047,
+      "learning_rate": 1.3813835858637715e-06,
+      "loss": 0.5747,
+      "step": 5336
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.34766411407479836,
+      "learning_rate": 1.3718622574804163e-06,
+      "loss": 0.546,
+      "step": 5337
+    },
+    {
+      "epoch": 0.9489777777777778,
+      "grad_norm": 0.3365621744039928,
+      "learning_rate": 1.362373629405067e-06,
+      "loss": 0.5386,
+      "step": 5338
+    },
+    {
+      "epoch": 0.9491555555555555,
+      "grad_norm": 0.34917634568600764,
+      "learning_rate": 1.3529177047836627e-06,
+      "loss": 0.5577,
+      "step": 5339
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.39029017879047967,
+      "learning_rate": 1.343494486751362e-06,
+      "loss": 0.5844,
+      "step": 5340
+    },
+    {
+      "epoch": 0.9495111111111111,
+      "grad_norm": 0.3558562257771291,
+      "learning_rate": 1.3341039784324106e-06,
+      "loss": 0.5465,
+      "step": 5341
+    },
+    {
+      "epoch": 0.9496888888888889,
+      "grad_norm": 0.35064301673146553,
+      "learning_rate": 1.3247461829402729e-06,
+      "loss": 0.4971,
+      "step": 5342
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.35849762798032986,
+      "learning_rate": 1.3154211033775344e-06,
+      "loss": 0.5518,
+      "step": 5343
+    },
+    {
+      "epoch": 0.9500444444444445,
+      "grad_norm": 0.34128929216535925,
+      "learning_rate": 1.3061287428359325e-06,
+      "loss": 0.5721,
+      "step": 5344
+    },
+    {
+      "epoch": 0.9502222222222222,
+      "grad_norm": 0.3579953394538537,
+      "learning_rate": 1.2968691043963699e-06,
+      "loss": 0.5519,
+      "step": 5345
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3708075142823777,
+      "learning_rate": 1.2876421911288905e-06,
+      "loss": 0.5986,
+      "step": 5346
+    },
+    {
+      "epoch": 0.9505777777777777,
+      "grad_norm": 0.3574258124770667,
+      "learning_rate": 1.2784480060926919e-06,
+      "loss": 0.5623,
+      "step": 5347
+    },
+    {
+      "epoch": 0.9507555555555556,
+      "grad_norm": 0.3741393739641284,
+      "learning_rate": 1.269286552336113e-06,
+      "loss": 0.5228,
+      "step": 5348
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.3350966867099206,
+      "learning_rate": 1.2601578328966578e-06,
+      "loss": 0.5701,
+      "step": 5349
+    },
+    {
+      "epoch": 0.9511111111111111,
+      "grad_norm": 0.3436924377359069,
+      "learning_rate": 1.2510618508009608e-06,
+      "loss": 0.5448,
+      "step": 5350
+    },
+    {
+      "epoch": 0.9512888888888889,
+      "grad_norm": 0.34746060247246097,
+      "learning_rate": 1.2419986090648205e-06,
+      "loss": 0.5868,
+      "step": 5351
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.36130259692645544,
+      "learning_rate": 1.2329681106931557e-06,
+      "loss": 0.5166,
+      "step": 5352
+    },
+    {
+      "epoch": 0.9516444444444444,
+      "grad_norm": 0.3537993097331322,
+      "learning_rate": 1.2239703586800378e-06,
+      "loss": 0.5582,
+      "step": 5353
+    },
+    {
+      "epoch": 0.9518222222222222,
+      "grad_norm": 0.35351759198379995,
+      "learning_rate": 1.2150053560087026e-06,
+      "loss": 0.55,
+      "step": 5354
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.36201929801838933,
+      "learning_rate": 1.2060731056514951e-06,
+      "loss": 0.5512,
+      "step": 5355
+    },
+    {
+      "epoch": 0.9521777777777778,
+      "grad_norm": 0.3339019435790876,
+      "learning_rate": 1.197173610569924e-06,
+      "loss": 0.5098,
+      "step": 5356
+    },
+    {
+      "epoch": 0.9523555555555555,
+      "grad_norm": 0.34992317516829263,
+      "learning_rate": 1.1883068737146285e-06,
+      "loss": 0.5486,
+      "step": 5357
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.40767003048670447,
+      "learning_rate": 1.1794728980253911e-06,
+      "loss": 0.5959,
+      "step": 5358
+    },
+    {
+      "epoch": 0.9527111111111111,
+      "grad_norm": 0.38108034839002447,
+      "learning_rate": 1.170671686431124e-06,
+      "loss": 0.5604,
+      "step": 5359
+    },
+    {
+      "epoch": 0.9528888888888889,
+      "grad_norm": 0.35078210040558905,
+      "learning_rate": 1.161903241849882e-06,
+      "loss": 0.5381,
+      "step": 5360
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.3374556292462107,
+      "learning_rate": 1.1531675671888619e-06,
+      "loss": 0.5258,
+      "step": 5361
+    },
+    {
+      "epoch": 0.9532444444444444,
+      "grad_norm": 0.36605471136271917,
+      "learning_rate": 1.1444646653443914e-06,
+      "loss": 0.5297,
+      "step": 5362
+    },
+    {
+      "epoch": 0.9534222222222222,
+      "grad_norm": 0.36669112939208975,
+      "learning_rate": 1.1357945392019064e-06,
+      "loss": 0.5897,
+      "step": 5363
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.35287350856307304,
+      "learning_rate": 1.1271571916360413e-06,
+      "loss": 0.5862,
+      "step": 5364
+    },
+    {
+      "epoch": 0.9537777777777777,
+      "grad_norm": 0.34656218670688343,
+      "learning_rate": 1.1185526255104938e-06,
+      "loss": 0.5632,
+      "step": 5365
+    },
+    {
+      "epoch": 0.9539555555555556,
+      "grad_norm": 0.3578602751675772,
+      "learning_rate": 1.1099808436781378e-06,
+      "loss": 0.5384,
+      "step": 5366
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.3444475857913409,
+      "learning_rate": 1.1014418489809331e-06,
+      "loss": 0.5026,
+      "step": 5367
+    },
+    {
+      "epoch": 0.9543111111111111,
+      "grad_norm": 0.3378381163817232,
+      "learning_rate": 1.092935644250026e-06,
+      "loss": 0.5372,
+      "step": 5368
+    },
+    {
+      "epoch": 0.9544888888888889,
+      "grad_norm": 0.3604527420605561,
+      "learning_rate": 1.0844622323056387e-06,
+      "loss": 0.5737,
+      "step": 5369
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.3582489023230677,
+      "learning_rate": 1.0760216159571679e-06,
+      "loss": 0.5639,
+      "step": 5370
+    },
+    {
+      "epoch": 0.9548444444444445,
+      "grad_norm": 0.34267321561581965,
+      "learning_rate": 1.0676137980030864e-06,
+      "loss": 0.5314,
+      "step": 5371
+    },
+    {
+      "epoch": 0.9550222222222222,
+      "grad_norm": 0.3544674141514693,
+      "learning_rate": 1.0592387812310311e-06,
+      "loss": 0.5389,
+      "step": 5372
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3400652746822241,
+      "learning_rate": 1.0508965684177586e-06,
+      "loss": 0.5424,
+      "step": 5373
+    },
+    {
+      "epoch": 0.9553777777777778,
+      "grad_norm": 0.3479779916557745,
+      "learning_rate": 1.042587162329134e-06,
+      "loss": 0.5617,
+      "step": 5374
+    },
+    {
+      "epoch": 0.9555555555555556,
+      "grad_norm": 0.37529025937367,
+      "learning_rate": 1.0343105657201534e-06,
+      "loss": 0.5922,
+      "step": 5375
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.36627646800943486,
+      "learning_rate": 1.0260667813349445e-06,
+      "loss": 0.5446,
+      "step": 5376
+    },
+    {
+      "epoch": 0.9559111111111112,
+      "grad_norm": 0.3521949148309049,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 0.5546,
+      "step": 5377
+    },
+    {
+      "epoch": 0.9560888888888889,
+      "grad_norm": 0.3477477444889299,
+      "learning_rate": 1.0096776601578705e-06,
+      "loss": 0.5468,
+      "step": 5378
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.34647139462035736,
+      "learning_rate": 1.0015323287998702e-06,
+      "loss": 0.5633,
+      "step": 5379
+    },
+    {
+      "epoch": 0.9564444444444444,
+      "grad_norm": 0.37700557480211677,
+      "learning_rate": 9.934198205332924e-07,
+      "loss": 0.5719,
+      "step": 5380
+    },
+    {
+      "epoch": 0.9566222222222223,
+      "grad_norm": 0.3616641937202924,
+      "learning_rate": 9.853401380478743e-07,
+      "loss": 0.5722,
+      "step": 5381
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3774046038469421,
+      "learning_rate": 9.772932840224292e-07,
+      "loss": 0.5725,
+      "step": 5382
+    },
+    {
+      "epoch": 0.9569777777777778,
+      "grad_norm": 0.34431839569775835,
+      "learning_rate": 9.692792611249224e-07,
+      "loss": 0.5755,
+      "step": 5383
+    },
+    {
+      "epoch": 0.9571555555555555,
+      "grad_norm": 0.3513637012455241,
+      "learning_rate": 9.612980720124065e-07,
+      "loss": 0.5193,
+      "step": 5384
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.37272470585199746,
+      "learning_rate": 9.533497193310537e-07,
+      "loss": 0.5405,
+      "step": 5385
+    },
+    {
+      "epoch": 0.9575111111111111,
+      "grad_norm": 0.3572153740789805,
+      "learning_rate": 9.454342057161558e-07,
+      "loss": 0.5659,
+      "step": 5386
+    },
+    {
+      "epoch": 0.9576888888888889,
+      "grad_norm": 0.3734913505550753,
+      "learning_rate": 9.375515337921136e-07,
+      "loss": 0.5467,
+      "step": 5387
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.3718162208776986,
+      "learning_rate": 9.297017061724367e-07,
+      "loss": 0.5842,
+      "step": 5388
+    },
+    {
+      "epoch": 0.9580444444444445,
+      "grad_norm": 0.3950648245050587,
+      "learning_rate": 9.218847254597429e-07,
+      "loss": 0.5884,
+      "step": 5389
+    },
+    {
+      "epoch": 0.9582222222222222,
+      "grad_norm": 0.3432289642551042,
+      "learning_rate": 9.141005942457814e-07,
+      "loss": 0.5358,
+      "step": 5390
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.39974622614959526,
+      "learning_rate": 9.063493151113655e-07,
+      "loss": 0.521,
+      "step": 5391
+    },
+    {
+      "epoch": 0.9585777777777778,
+      "grad_norm": 0.37147636089316044,
+      "learning_rate": 8.98630890626484e-07,
+      "loss": 0.5613,
+      "step": 5392
+    },
+    {
+      "epoch": 0.9587555555555556,
+      "grad_norm": 0.35183537377008933,
+      "learning_rate": 8.909453233501452e-07,
+      "loss": 0.5463,
+      "step": 5393
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.3921724110598052,
+      "learning_rate": 8.832926158305444e-07,
+      "loss": 0.5355,
+      "step": 5394
+    },
+    {
+      "epoch": 0.9591111111111111,
+      "grad_norm": 0.35512229464074563,
+      "learning_rate": 8.756727706049295e-07,
+      "loss": 0.5189,
+      "step": 5395
+    },
+    {
+      "epoch": 0.9592888888888889,
+      "grad_norm": 0.3991940619770938,
+      "learning_rate": 8.680857901996798e-07,
+      "loss": 0.5858,
+      "step": 5396
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.3633625172871414,
+      "learning_rate": 8.605316771302719e-07,
+      "loss": 0.5401,
+      "step": 5397
+    },
+    {
+      "epoch": 0.9596444444444444,
+      "grad_norm": 0.35451917904652025,
+      "learning_rate": 8.530104339012801e-07,
+      "loss": 0.586,
+      "step": 5398
+    },
+    {
+      "epoch": 0.9598222222222222,
+      "grad_norm": 0.3806410750706742,
+      "learning_rate": 8.455220630063764e-07,
+      "loss": 0.5293,
+      "step": 5399
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35760426167700343,
+      "learning_rate": 8.380665669283527e-07,
+      "loss": 0.55,
+      "step": 5400
+    },
+    {
+      "epoch": 0.9601777777777778,
+      "grad_norm": 0.3679085796450376,
+      "learning_rate": 8.30643948139087e-07,
+      "loss": 0.5874,
+      "step": 5401
+    },
+    {
+      "epoch": 0.9603555555555555,
+      "grad_norm": 0.3528354724167527,
+      "learning_rate": 8.232542090995665e-07,
+      "loss": 0.5475,
+      "step": 5402
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.3616565251982588,
+      "learning_rate": 8.158973522598534e-07,
+      "loss": 0.5467,
+      "step": 5403
+    },
+    {
+      "epoch": 0.9607111111111111,
+      "grad_norm": 0.3425461512134011,
+      "learning_rate": 8.085733800591411e-07,
+      "loss": 0.5392,
+      "step": 5404
+    },
+    {
+      "epoch": 0.9608888888888889,
+      "grad_norm": 0.34906939838943474,
+      "learning_rate": 8.012822949256982e-07,
+      "loss": 0.5302,
+      "step": 5405
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.3694655899937999,
+      "learning_rate": 7.94024099276891e-07,
+      "loss": 0.5304,
+      "step": 5406
+    },
+    {
+      "epoch": 0.9612444444444445,
+      "grad_norm": 0.35474902290277344,
+      "learning_rate": 7.867987955191947e-07,
+      "loss": 0.573,
+      "step": 5407
+    },
+    {
+      "epoch": 0.9614222222222222,
+      "grad_norm": 0.35602761656784154,
+      "learning_rate": 7.796063860481595e-07,
+      "loss": 0.6127,
+      "step": 5408
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.34456720727142676,
+      "learning_rate": 7.724468732484336e-07,
+      "loss": 0.5302,
+      "step": 5409
+    },
+    {
+      "epoch": 0.9617777777777777,
+      "grad_norm": 0.3421583240788226,
+      "learning_rate": 7.653202594937848e-07,
+      "loss": 0.5596,
+      "step": 5410
+    },
+    {
+      "epoch": 0.9619555555555556,
+      "grad_norm": 0.3356554085273903,
+      "learning_rate": 7.58226547147034e-07,
+      "loss": 0.5638,
+      "step": 5411
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.37827977989320466,
+      "learning_rate": 7.511657385601223e-07,
+      "loss": 0.5751,
+      "step": 5412
+    },
+    {
+      "epoch": 0.9623111111111111,
+      "grad_norm": 0.36017266466342873,
+      "learning_rate": 7.441378360740659e-07,
+      "loss": 0.5442,
+      "step": 5413
+    },
+    {
+      "epoch": 0.9624888888888888,
+      "grad_norm": 0.35887186855440556,
+      "learning_rate": 7.371428420189896e-07,
+      "loss": 0.587,
+      "step": 5414
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.36722490455094403,
+      "learning_rate": 7.301807587140718e-07,
+      "loss": 0.5263,
+      "step": 5415
+    },
+    {
+      "epoch": 0.9628444444444444,
+      "grad_norm": 0.357107058037139,
+      "learning_rate": 7.232515884676328e-07,
+      "loss": 0.5606,
+      "step": 5416
+    },
+    {
+      "epoch": 0.9630222222222222,
+      "grad_norm": 0.3466909570255286,
+      "learning_rate": 7.163553335770123e-07,
+      "loss": 0.5609,
+      "step": 5417
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3561257401402519,
+      "learning_rate": 7.094919963287039e-07,
+      "loss": 0.5529,
+      "step": 5418
+    },
+    {
+      "epoch": 0.9633777777777778,
+      "grad_norm": 0.35009188376432077,
+      "learning_rate": 7.026615789982426e-07,
+      "loss": 0.5494,
+      "step": 5419
+    },
+    {
+      "epoch": 0.9635555555555556,
+      "grad_norm": 0.38536900595610263,
+      "learning_rate": 6.958640838502617e-07,
+      "loss": 0.5822,
+      "step": 5420
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.3401491511918444,
+      "learning_rate": 6.890995131384914e-07,
+      "loss": 0.5695,
+      "step": 5421
+    },
+    {
+      "epoch": 0.9639111111111112,
+      "grad_norm": 0.3474105035663054,
+      "learning_rate": 6.823678691057378e-07,
+      "loss": 0.531,
+      "step": 5422
+    },
+    {
+      "epoch": 0.9640888888888889,
+      "grad_norm": 0.46784446071911057,
+      "learning_rate": 6.756691539838711e-07,
+      "loss": 0.6284,
+      "step": 5423
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.3593624699289564,
+      "learning_rate": 6.690033699938703e-07,
+      "loss": 0.565,
+      "step": 5424
+    },
+    {
+      "epoch": 0.9644444444444444,
+      "grad_norm": 0.36142721943378503,
+      "learning_rate": 6.623705193457897e-07,
+      "loss": 0.581,
+      "step": 5425
+    },
+    {
+      "epoch": 0.9646222222222223,
+      "grad_norm": 0.3502868676050175,
+      "learning_rate": 6.557706042387479e-07,
+      "loss": 0.5577,
+      "step": 5426
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3521697660218038,
+      "learning_rate": 6.492036268609725e-07,
+      "loss": 0.5597,
+      "step": 5427
+    },
+    {
+      "epoch": 0.9649777777777778,
+      "grad_norm": 0.37050049798356277,
+      "learning_rate": 6.426695893897439e-07,
+      "loss": 0.5546,
+      "step": 5428
+    },
+    {
+      "epoch": 0.9651555555555555,
+      "grad_norm": 0.5402142513844844,
+      "learning_rate": 6.361684939914403e-07,
+      "loss": 0.5531,
+      "step": 5429
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.387153544310888,
+      "learning_rate": 6.297003428215043e-07,
+      "loss": 0.5899,
+      "step": 5430
+    },
+    {
+      "epoch": 0.9655111111111111,
+      "grad_norm": 0.37677745294000703,
+      "learning_rate": 6.232651380244536e-07,
+      "loss": 0.567,
+      "step": 5431
+    },
+    {
+      "epoch": 0.9656888888888889,
+      "grad_norm": 0.4231264377129029,
+      "learning_rate": 6.168628817339151e-07,
+      "loss": 0.5372,
+      "step": 5432
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.3568154017915155,
+      "learning_rate": 6.10493576072535e-07,
+      "loss": 0.5471,
+      "step": 5433
+    },
+    {
+      "epoch": 0.9660444444444445,
+      "grad_norm": 0.36449396883116014,
+      "learning_rate": 6.041572231520909e-07,
+      "loss": 0.5841,
+      "step": 5434
+    },
+    {
+      "epoch": 0.9662222222222222,
+      "grad_norm": 0.3602882630383994,
+      "learning_rate": 5.978538250733912e-07,
+      "loss": 0.5757,
+      "step": 5435
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.35712339853398994,
+      "learning_rate": 5.91583383926353e-07,
+      "loss": 0.5585,
+      "step": 5436
+    },
+    {
+      "epoch": 0.9665777777777778,
+      "grad_norm": 0.3877252415612054,
+      "learning_rate": 5.853459017899465e-07,
+      "loss": 0.5801,
+      "step": 5437
+    },
+    {
+      "epoch": 0.9667555555555556,
+      "grad_norm": 0.3512628538980189,
+      "learning_rate": 5.791413807322066e-07,
+      "loss": 0.5716,
+      "step": 5438
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.36079640159897963,
+      "learning_rate": 5.729698228102653e-07,
+      "loss": 0.562,
+      "step": 5439
+    },
+    {
+      "epoch": 0.9671111111111111,
+      "grad_norm": 0.34757703685942626,
+      "learning_rate": 5.668312300703193e-07,
+      "loss": 0.5626,
+      "step": 5440
+    },
+    {
+      "epoch": 0.9672888888888889,
+      "grad_norm": 0.3408285438004156,
+      "learning_rate": 5.607256045475961e-07,
+      "loss": 0.5145,
+      "step": 5441
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.3526492581847657,
+      "learning_rate": 5.546529482664542e-07,
+      "loss": 0.513,
+      "step": 5442
+    },
+    {
+      "epoch": 0.9676444444444444,
+      "grad_norm": 0.3901065834536196,
+      "learning_rate": 5.48613263240283e-07,
+      "loss": 0.5591,
+      "step": 5443
+    },
+    {
+      "epoch": 0.9678222222222223,
+      "grad_norm": 0.3759670880425791,
+      "learning_rate": 5.426065514715583e-07,
+      "loss": 0.563,
+      "step": 5444
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.34732805944771866,
+      "learning_rate": 5.366328149517985e-07,
+      "loss": 0.5679,
+      "step": 5445
+    },
+    {
+      "epoch": 0.9681777777777778,
+      "grad_norm": 0.3288592844304263,
+      "learning_rate": 5.306920556616079e-07,
+      "loss": 0.5314,
+      "step": 5446
+    },
+    {
+      "epoch": 0.9683555555555555,
+      "grad_norm": 0.4675801502975746,
+      "learning_rate": 5.247842755706556e-07,
+      "loss": 0.5856,
+      "step": 5447
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.35493248860792603,
+      "learning_rate": 5.189094766376857e-07,
+      "loss": 0.5149,
+      "step": 5448
+    },
+    {
+      "epoch": 0.9687111111111111,
+      "grad_norm": 0.348665135333137,
+      "learning_rate": 5.130676608104845e-07,
+      "loss": 0.5358,
+      "step": 5449
+    },
+    {
+      "epoch": 0.9688888888888889,
+      "grad_norm": 0.33541154220408803,
+      "learning_rate": 5.072588300259251e-07,
+      "loss": 0.5423,
+      "step": 5450
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.36040820853791644,
+      "learning_rate": 5.014829862099224e-07,
+      "loss": 0.5491,
+      "step": 5451
+    },
+    {
+      "epoch": 0.9692444444444445,
+      "grad_norm": 0.360746446168027,
+      "learning_rate": 4.957401312774668e-07,
+      "loss": 0.5153,
+      "step": 5452
+    },
+    {
+      "epoch": 0.9694222222222222,
+      "grad_norm": 0.3569077457181339,
+      "learning_rate": 4.90030267132624e-07,
+      "loss": 0.57,
+      "step": 5453
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.35158743579374396,
+      "learning_rate": 4.84353395668502e-07,
+      "loss": 0.5571,
+      "step": 5454
+    },
+    {
+      "epoch": 0.9697777777777777,
+      "grad_norm": 0.3793438134930771,
+      "learning_rate": 4.787095187672619e-07,
+      "loss": 0.5853,
+      "step": 5455
+    },
+    {
+      "epoch": 0.9699555555555556,
+      "grad_norm": 0.35437261290400834,
+      "learning_rate": 4.7309863830016233e-07,
+      "loss": 0.5223,
+      "step": 5456
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.3512962838910777,
+      "learning_rate": 4.6752075612748194e-07,
+      "loss": 0.5175,
+      "step": 5457
+    },
+    {
+      "epoch": 0.9703111111111111,
+      "grad_norm": 0.34509414671064076,
+      "learning_rate": 4.6197587409858577e-07,
+      "loss": 0.5693,
+      "step": 5458
+    },
+    {
+      "epoch": 0.9704888888888888,
+      "grad_norm": 0.3688843346545426,
+      "learning_rate": 4.564639940518811e-07,
+      "loss": 0.5393,
+      "step": 5459
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.3556890760854046,
+      "learning_rate": 4.509851178148505e-07,
+      "loss": 0.5421,
+      "step": 5460
+    },
+    {
+      "epoch": 0.9708444444444444,
+      "grad_norm": 0.3516577003029395,
+      "learning_rate": 4.4553924720400765e-07,
+      "loss": 0.5952,
+      "step": 5461
+    },
+    {
+      "epoch": 0.9710222222222222,
+      "grad_norm": 0.3467120078411363,
+      "learning_rate": 4.4012638402495255e-07,
+      "loss": 0.5165,
+      "step": 5462
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3656582330315011,
+      "learning_rate": 4.3474653007231635e-07,
+      "loss": 0.5489,
+      "step": 5463
+    },
+    {
+      "epoch": 0.9713777777777778,
+      "grad_norm": 0.3527899949266385,
+      "learning_rate": 4.293996871298167e-07,
+      "loss": 0.5644,
+      "step": 5464
+    },
+    {
+      "epoch": 0.9715555555555555,
+      "grad_norm": 0.35307655352209605,
+      "learning_rate": 4.240858569701911e-07,
+      "loss": 0.5387,
+      "step": 5465
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.48136338695401615,
+      "learning_rate": 4.1880504135525243e-07,
+      "loss": 0.5612,
+      "step": 5466
+    },
+    {
+      "epoch": 0.9719111111111111,
+      "grad_norm": 0.37135339383803606,
+      "learning_rate": 4.135572420358669e-07,
+      "loss": 0.5407,
+      "step": 5467
+    },
+    {
+      "epoch": 0.9720888888888889,
+      "grad_norm": 0.36097237660409126,
+      "learning_rate": 4.083424607519426e-07,
+      "loss": 0.5288,
+      "step": 5468
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.34795068746128205,
+      "learning_rate": 4.0316069923245216e-07,
+      "loss": 0.5696,
+      "step": 5469
+    },
+    {
+      "epoch": 0.9724444444444444,
+      "grad_norm": 0.3494201776097614,
+      "learning_rate": 3.9801195919541014e-07,
+      "loss": 0.5696,
+      "step": 5470
+    },
+    {
+      "epoch": 0.9726222222222223,
+      "grad_norm": 0.3676729833698882,
+      "learning_rate": 3.9289624234790656e-07,
+      "loss": 0.5714,
+      "step": 5471
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.35009172317281645,
+      "learning_rate": 3.878135503860403e-07,
+      "loss": 0.5597,
+      "step": 5472
+    },
+    {
+      "epoch": 0.9729777777777778,
+      "grad_norm": 0.41742279189550324,
+      "learning_rate": 3.827638849950077e-07,
+      "loss": 0.5498,
+      "step": 5473
+    },
+    {
+      "epoch": 0.9731555555555556,
+      "grad_norm": 0.3503718331337725,
+      "learning_rate": 3.7774724784902514e-07,
+      "loss": 0.5441,
+      "step": 5474
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.3622794728247527,
+      "learning_rate": 3.7276364061137327e-07,
+      "loss": 0.5834,
+      "step": 5475
+    },
+    {
+      "epoch": 0.9735111111111111,
+      "grad_norm": 0.34447645874557425,
+      "learning_rate": 3.678130649343525e-07,
+      "loss": 0.5443,
+      "step": 5476
+    },
+    {
+      "epoch": 0.9736888888888889,
+      "grad_norm": 0.33317952899149395,
+      "learning_rate": 3.6289552245935e-07,
+      "loss": 0.5133,
+      "step": 5477
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.34268997990101807,
+      "learning_rate": 3.5801101481679476e-07,
+      "loss": 0.5353,
+      "step": 5478
+    },
+    {
+      "epoch": 0.9740444444444445,
+      "grad_norm": 0.34421877002071316,
+      "learning_rate": 3.531595436261248e-07,
+      "loss": 0.5289,
+      "step": 5479
+    },
+    {
+      "epoch": 0.9742222222222222,
+      "grad_norm": 0.3739710723491484,
+      "learning_rate": 3.483411104958756e-07,
+      "loss": 0.6048,
+      "step": 5480
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.35903856974801834,
+      "learning_rate": 3.435557170236026e-07,
+      "loss": 0.5873,
+      "step": 5481
+    },
+    {
+      "epoch": 0.9745777777777778,
+      "grad_norm": 0.3577075506589855,
+      "learning_rate": 3.3880336479590325e-07,
+      "loss": 0.5417,
+      "step": 5482
+    },
+    {
+      "epoch": 0.9747555555555556,
+      "grad_norm": 0.360848677637396,
+      "learning_rate": 3.340840553884284e-07,
+      "loss": 0.5737,
+      "step": 5483
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.345084974472568,
+      "learning_rate": 3.293977903658707e-07,
+      "loss": 0.5234,
+      "step": 5484
+    },
+    {
+      "epoch": 0.9751111111111112,
+      "grad_norm": 0.34056513458405746,
+      "learning_rate": 3.247445712819763e-07,
+      "loss": 0.5447,
+      "step": 5485
+    },
+    {
+      "epoch": 0.9752888888888889,
+      "grad_norm": 0.36877192195042774,
+      "learning_rate": 3.2012439967952224e-07,
+      "loss": 0.5773,
+      "step": 5486
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.3448205492016093,
+      "learning_rate": 3.1553727709032754e-07,
+      "loss": 0.5162,
+      "step": 5487
+    },
+    {
+      "epoch": 0.9756444444444444,
+      "grad_norm": 0.4726100505417255,
+      "learning_rate": 3.1098320503527567e-07,
+      "loss": 0.5562,
+      "step": 5488
+    },
+    {
+      "epoch": 0.9758222222222223,
+      "grad_norm": 0.3762643669284478,
+      "learning_rate": 3.0646218502425886e-07,
+      "loss": 0.5429,
+      "step": 5489
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3499516844482082,
+      "learning_rate": 3.0197421855624463e-07,
+      "loss": 0.5609,
+      "step": 5490
+    },
+    {
+      "epoch": 0.9761777777777778,
+      "grad_norm": 0.37564789925550124,
+      "learning_rate": 2.975193071191984e-07,
+      "loss": 0.5668,
+      "step": 5491
+    },
+    {
+      "epoch": 0.9763555555555555,
+      "grad_norm": 0.368144490085935,
+      "learning_rate": 2.9309745219018306e-07,
+      "loss": 0.5916,
+      "step": 5492
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.3679616236587089,
+      "learning_rate": 2.8870865523525915e-07,
+      "loss": 0.5615,
+      "step": 5493
+    },
+    {
+      "epoch": 0.9767111111111111,
+      "grad_norm": 0.3690715790947065,
+      "learning_rate": 2.8435291770952945e-07,
+      "loss": 0.5981,
+      "step": 5494
+    },
+    {
+      "epoch": 0.9768888888888889,
+      "grad_norm": 0.3614304588670998,
+      "learning_rate": 2.8003024105716093e-07,
+      "loss": 0.5291,
+      "step": 5495
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.33791120472386843,
+      "learning_rate": 2.757406267113294e-07,
+      "loss": 0.5246,
+      "step": 5496
+    },
+    {
+      "epoch": 0.9772444444444445,
+      "grad_norm": 0.36684205925729624,
+      "learning_rate": 2.7148407609427493e-07,
+      "loss": 0.5573,
+      "step": 5497
+    },
+    {
+      "epoch": 0.9774222222222222,
+      "grad_norm": 0.3695401872898539,
+      "learning_rate": 2.6726059061725763e-07,
+      "loss": 0.5401,
+      "step": 5498
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4072693653699395,
+      "learning_rate": 2.6307017168057946e-07,
+      "loss": 0.55,
+      "step": 5499
+    },
+    {
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.3692389070393495,
+      "learning_rate": 2.589128206735847e-07,
+      "loss": 0.558,
+      "step": 5500
+    },
+    {
+      "epoch": 0.9779555555555556,
+      "grad_norm": 0.3391823078749012,
+      "learning_rate": 2.547885389746485e-07,
+      "loss": 0.5675,
+      "step": 5501
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.3200204008227983,
+      "learning_rate": 2.5069732795117706e-07,
+      "loss": 0.5569,
+      "step": 5502
+    },
+    {
+      "epoch": 0.9783111111111111,
+      "grad_norm": 0.3384277355644874,
+      "learning_rate": 2.4663918895961867e-07,
+      "loss": 0.5312,
+      "step": 5503
+    },
+    {
+      "epoch": 0.9784888888888889,
+      "grad_norm": 0.3283979876855168,
+      "learning_rate": 2.4261412334546376e-07,
+      "loss": 0.4883,
+      "step": 5504
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.32243300836604566,
+      "learning_rate": 2.386221324432225e-07,
+      "loss": 0.5303,
+      "step": 5505
+    },
+    {
+      "epoch": 0.9788444444444444,
+      "grad_norm": 0.35356185578856764,
+      "learning_rate": 2.3466321757644738e-07,
+      "loss": 0.5891,
+      "step": 5506
+    },
+    {
+      "epoch": 0.9790222222222222,
+      "grad_norm": 0.35582005274556755,
+      "learning_rate": 2.3073738005771062e-07,
+      "loss": 0.5725,
+      "step": 5507
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.37710228725285483,
+      "learning_rate": 2.268446211886599e-07,
+      "loss": 0.5685,
+      "step": 5508
+    },
+    {
+      "epoch": 0.9793777777777778,
+      "grad_norm": 0.41710705963907313,
+      "learning_rate": 2.229849422599073e-07,
+      "loss": 0.5389,
+      "step": 5509
+    },
+    {
+      "epoch": 0.9795555555555555,
+      "grad_norm": 0.37425483809417903,
+      "learning_rate": 2.1915834455116247e-07,
+      "loss": 0.5837,
+      "step": 5510
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.3627831290090467,
+      "learning_rate": 2.1536482933113277e-07,
+      "loss": 0.5549,
+      "step": 5511
+    },
+    {
+      "epoch": 0.9799111111111111,
+      "grad_norm": 0.38079303882544413,
+      "learning_rate": 2.116043978575566e-07,
+      "loss": 0.5439,
+      "step": 5512
+    },
+    {
+      "epoch": 0.9800888888888889,
+      "grad_norm": 0.3730370819581776,
+      "learning_rate": 2.0787705137721437e-07,
+      "loss": 0.5575,
+      "step": 5513
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.3686317814718325,
+      "learning_rate": 2.0418279112592863e-07,
+      "loss": 0.5495,
+      "step": 5514
+    },
+    {
+      "epoch": 0.9804444444444445,
+      "grad_norm": 0.37282254742977555,
+      "learning_rate": 2.0052161832850856e-07,
+      "loss": 0.5477,
+      "step": 5515
+    },
+    {
+      "epoch": 0.9806222222222222,
+      "grad_norm": 0.3452617630183563,
+      "learning_rate": 1.9689353419884982e-07,
+      "loss": 0.5481,
+      "step": 5516
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3420643396652723,
+      "learning_rate": 1.9329853993982349e-07,
+      "loss": 0.5355,
+      "step": 5517
+    },
+    {
+      "epoch": 0.9809777777777777,
+      "grad_norm": 0.3615547780550409,
+      "learning_rate": 1.8973663674337616e-07,
+      "loss": 0.5506,
+      "step": 5518
+    },
+    {
+      "epoch": 0.9811555555555556,
+      "grad_norm": 0.3732584748243469,
+      "learning_rate": 1.8620782579045204e-07,
+      "loss": 0.5777,
+      "step": 5519
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.3613884100980496,
+      "learning_rate": 1.8271210825102636e-07,
+      "loss": 0.5288,
+      "step": 5520
+    },
+    {
+      "epoch": 0.9815111111111111,
+      "grad_norm": 0.3445004688165788,
+      "learning_rate": 1.7924948528412755e-07,
+      "loss": 0.5623,
+      "step": 5521
+    },
+    {
+      "epoch": 0.9816888888888889,
+      "grad_norm": 0.3513575965184217,
+      "learning_rate": 1.7581995803778172e-07,
+      "loss": 0.5608,
+      "step": 5522
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.3621118407549723,
+      "learning_rate": 1.7242352764905712e-07,
+      "loss": 0.5891,
+      "step": 5523
+    },
+    {
+      "epoch": 0.9820444444444445,
+      "grad_norm": 0.3460912215034074,
+      "learning_rate": 1.6906019524405293e-07,
+      "loss": 0.5587,
+      "step": 5524
+    },
+    {
+      "epoch": 0.9822222222222222,
+      "grad_norm": 0.3550422459197319,
+      "learning_rate": 1.6572996193786604e-07,
+      "loss": 0.5338,
+      "step": 5525
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.34640932884176445,
+      "learning_rate": 1.624328288346577e-07,
+      "loss": 0.5376,
+      "step": 5526
+    },
+    {
+      "epoch": 0.9825777777777778,
+      "grad_norm": 0.39468485751583904,
+      "learning_rate": 1.5916879702759791e-07,
+      "loss": 0.5826,
+      "step": 5527
+    },
+    {
+      "epoch": 0.9827555555555556,
+      "grad_norm": 0.33847566406756374,
+      "learning_rate": 1.5593786759886542e-07,
+      "loss": 0.5406,
+      "step": 5528
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.3711218036793891,
+      "learning_rate": 1.5274004161970335e-07,
+      "loss": 0.588,
+      "step": 5529
+    },
+    {
+      "epoch": 0.9831111111111112,
+      "grad_norm": 0.33041180679215765,
+      "learning_rate": 1.4957532015034137e-07,
+      "loss": 0.5202,
+      "step": 5530
+    },
+    {
+      "epoch": 0.9832888888888889,
+      "grad_norm": 0.35892403257342553,
+      "learning_rate": 1.4644370424004016e-07,
+      "loss": 0.5539,
+      "step": 5531
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.3589863036505411,
+      "learning_rate": 1.4334519492711362e-07,
+      "loss": 0.5687,
+      "step": 5532
+    },
+    {
+      "epoch": 0.9836444444444444,
+      "grad_norm": 0.4011601762703607,
+      "learning_rate": 1.402797932388511e-07,
+      "loss": 0.5445,
+      "step": 5533
+    },
+    {
+      "epoch": 0.9838222222222223,
+      "grad_norm": 0.3518349011977445,
+      "learning_rate": 1.3724750019161735e-07,
+      "loss": 0.5492,
+      "step": 5534
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.33824305101839586,
+      "learning_rate": 1.3424831679075267e-07,
+      "loss": 0.5498,
+      "step": 5535
+    },
+    {
+      "epoch": 0.9841777777777778,
+      "grad_norm": 0.36741881281851074,
+      "learning_rate": 1.3128224403065048e-07,
+      "loss": 0.6051,
+      "step": 5536
+    },
+    {
+      "epoch": 0.9843555555555555,
+      "grad_norm": 0.3368415234087323,
+      "learning_rate": 1.2834928289472416e-07,
+      "loss": 0.547,
+      "step": 5537
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.3551130166564082,
+      "learning_rate": 1.254494343553847e-07,
+      "loss": 0.5621,
+      "step": 5538
+    },
+    {
+      "epoch": 0.9847111111111111,
+      "grad_norm": 0.36301317101754615,
+      "learning_rate": 1.225826993740853e-07,
+      "loss": 0.5236,
+      "step": 5539
+    },
+    {
+      "epoch": 0.9848888888888889,
+      "grad_norm": 0.33436272750414997,
+      "learning_rate": 1.1974907890131004e-07,
+      "loss": 0.5203,
+      "step": 5540
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.348942392334364,
+      "learning_rate": 1.1694857387652969e-07,
+      "loss": 0.5744,
+      "step": 5541
+    },
+    {
+      "epoch": 0.9852444444444445,
+      "grad_norm": 0.3777856821159728,
+      "learning_rate": 1.1418118522826814e-07,
+      "loss": 0.6073,
+      "step": 5542
+    },
+    {
+      "epoch": 0.9854222222222222,
+      "grad_norm": 0.3966836904203387,
+      "learning_rate": 1.1144691387405815e-07,
+      "loss": 0.5415,
+      "step": 5543
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.343002093494387,
+      "learning_rate": 1.0874576072045228e-07,
+      "loss": 0.5301,
+      "step": 5544
+    },
+    {
+      "epoch": 0.9857777777777778,
+      "grad_norm": 0.36895986842017847,
+      "learning_rate": 1.0607772666302306e-07,
+      "loss": 0.563,
+      "step": 5545
+    },
+    {
+      "epoch": 0.9859555555555556,
+      "grad_norm": 0.35532454730600016,
+      "learning_rate": 1.0344281258634069e-07,
+      "loss": 0.543,
+      "step": 5546
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.4179300742293262,
+      "learning_rate": 1.0084101936403967e-07,
+      "loss": 0.5685,
+      "step": 5547
+    },
+    {
+      "epoch": 0.9863111111111111,
+      "grad_norm": 0.34776762268395794,
+      "learning_rate": 9.827234785874107e-08,
+      "loss": 0.5414,
+      "step": 5548
+    },
+    {
+      "epoch": 0.9864888888888889,
+      "grad_norm": 0.3565924009116993,
+      "learning_rate": 9.573679892209697e-08,
+      "loss": 0.5365,
+      "step": 5549
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.3945472781566639,
+      "learning_rate": 9.323437339475715e-08,
+      "loss": 0.5597,
+      "step": 5550
+    },
+    {
+      "epoch": 0.9868444444444444,
+      "grad_norm": 0.36569837114274645,
+      "learning_rate": 9.076507210641349e-08,
+      "loss": 0.5806,
+      "step": 5551
+    },
+    {
+      "epoch": 0.9870222222222222,
+      "grad_norm": 0.39186677212776977,
+      "learning_rate": 8.832889587576665e-08,
+      "loss": 0.533,
+      "step": 5552
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3523771507508271,
+      "learning_rate": 8.592584551053718e-08,
+      "loss": 0.5568,
+      "step": 5553
+    },
+    {
+      "epoch": 0.9873777777777778,
+      "grad_norm": 0.3620987427501741,
+      "learning_rate": 8.355592180745442e-08,
+      "loss": 0.5443,
+      "step": 5554
+    },
+    {
+      "epoch": 0.9875555555555555,
+      "grad_norm": 0.3531817696270951,
+      "learning_rate": 8.121912555226762e-08,
+      "loss": 0.5704,
+      "step": 5555
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.43565048600195005,
+      "learning_rate": 7.891545751975704e-08,
+      "loss": 0.5439,
+      "step": 5556
+    },
+    {
+      "epoch": 0.9879111111111111,
+      "grad_norm": 0.35796229388168166,
+      "learning_rate": 7.664491847370058e-08,
+      "loss": 0.5362,
+      "step": 5557
+    },
+    {
+      "epoch": 0.9880888888888889,
+      "grad_norm": 0.370210252698021,
+      "learning_rate": 7.44075091669072e-08,
+      "loss": 0.5699,
+      "step": 5558
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.35624343892069227,
+      "learning_rate": 7.220323034117238e-08,
+      "loss": 0.5281,
+      "step": 5559
+    },
+    {
+      "epoch": 0.9884444444444445,
+      "grad_norm": 0.341902259729492,
+      "learning_rate": 7.003208272734484e-08,
+      "loss": 0.6027,
+      "step": 5560
+    },
+    {
+      "epoch": 0.9886222222222222,
+      "grad_norm": 0.3759589580980951,
+      "learning_rate": 6.789406704527102e-08,
+      "loss": 0.5688,
+      "step": 5561
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.35435420877715273,
+      "learning_rate": 6.578918400380608e-08,
+      "loss": 0.5493,
+      "step": 5562
+    },
+    {
+      "epoch": 0.9889777777777777,
+      "grad_norm": 0.3550138379215988,
+      "learning_rate": 6.37174343008251e-08,
+      "loss": 0.5589,
+      "step": 5563
+    },
+    {
+      "epoch": 0.9891555555555556,
+      "grad_norm": 0.34494653894257016,
+      "learning_rate": 6.167881862324531e-08,
+      "loss": 0.5377,
+      "step": 5564
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.36600237996417695,
+      "learning_rate": 5.967333764693716e-08,
+      "loss": 0.5789,
+      "step": 5565
+    },
+    {
+      "epoch": 0.9895111111111111,
+      "grad_norm": 0.3561551322628878,
+      "learning_rate": 5.770099203683543e-08,
+      "loss": 0.5734,
+      "step": 5566
+    },
+    {
+      "epoch": 0.9896888888888888,
+      "grad_norm": 0.3582113555832297,
+      "learning_rate": 5.576178244688368e-08,
+      "loss": 0.5892,
+      "step": 5567
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.3497097442952161,
+      "learning_rate": 5.3855709520023165e-08,
+      "loss": 0.5303,
+      "step": 5568
+    },
+    {
+      "epoch": 0.9900444444444444,
+      "grad_norm": 0.3539917363723397,
+      "learning_rate": 5.198277388821504e-08,
+      "loss": 0.5463,
+      "step": 5569
+    },
+    {
+      "epoch": 0.9902222222222222,
+      "grad_norm": 0.34107607348128277,
+      "learning_rate": 5.0142976172429246e-08,
+      "loss": 0.5312,
+      "step": 5570
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3567626598098884,
+      "learning_rate": 4.833631698265562e-08,
+      "loss": 0.5709,
+      "step": 5571
+    },
+    {
+      "epoch": 0.9905777777777778,
+      "grad_norm": 0.3373312123765254,
+      "learning_rate": 4.656279691789278e-08,
+      "loss": 0.5684,
+      "step": 5572
+    },
+    {
+      "epoch": 0.9907555555555555,
+      "grad_norm": 0.34874024947239707,
+      "learning_rate": 4.4822416566170364e-08,
+      "loss": 0.5836,
+      "step": 5573
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.3682855535615194,
+      "learning_rate": 4.311517650449348e-08,
+      "loss": 0.5652,
+      "step": 5574
+    },
+    {
+      "epoch": 0.9911111111111112,
+      "grad_norm": 0.3379455688319691,
+      "learning_rate": 4.144107729890934e-08,
+      "loss": 0.5751,
+      "step": 5575
+    },
+    {
+      "epoch": 0.9912888888888889,
+      "grad_norm": 0.35474321916641693,
+      "learning_rate": 3.980011950446283e-08,
+      "loss": 0.5935,
+      "step": 5576
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.3433754359340062,
+      "learning_rate": 3.819230366521875e-08,
+      "loss": 0.5111,
+      "step": 5577
+    },
+    {
+      "epoch": 0.9916444444444444,
+      "grad_norm": 0.3533057673642881,
+      "learning_rate": 3.6617630314261795e-08,
+      "loss": 0.5314,
+      "step": 5578
+    },
+    {
+      "epoch": 0.9918222222222223,
+      "grad_norm": 0.3790287238107738,
+      "learning_rate": 3.507609997366323e-08,
+      "loss": 0.5096,
+      "step": 5579
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3378724837986974,
+      "learning_rate": 3.3567713154525337e-08,
+      "loss": 0.5486,
+      "step": 5580
+    },
+    {
+      "epoch": 0.9921777777777778,
+      "grad_norm": 0.34264666190762966,
+      "learning_rate": 3.2092470356948066e-08,
+      "loss": 0.5473,
+      "step": 5581
+    },
+    {
+      "epoch": 0.9923555555555555,
+      "grad_norm": 0.36422873822245894,
+      "learning_rate": 3.065037207006238e-08,
+      "loss": 0.5816,
+      "step": 5582
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.35290101452688716,
+      "learning_rate": 2.924141877198583e-08,
+      "loss": 0.5676,
+      "step": 5583
+    },
+    {
+      "epoch": 0.9927111111111111,
+      "grad_norm": 0.3414961394944708,
+      "learning_rate": 2.786561092987805e-08,
+      "loss": 0.5611,
+      "step": 5584
+    },
+    {
+      "epoch": 0.9928888888888889,
+      "grad_norm": 0.35658471742544945,
+      "learning_rate": 2.6522948999874175e-08,
+      "loss": 0.5427,
+      "step": 5585
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.35565346844172646,
+      "learning_rate": 2.5213433427140333e-08,
+      "loss": 0.5786,
+      "step": 5586
+    },
+    {
+      "epoch": 0.9932444444444445,
+      "grad_norm": 0.3635110179537637,
+      "learning_rate": 2.3937064645840333e-08,
+      "loss": 0.56,
+      "step": 5587
+    },
+    {
+      "epoch": 0.9934222222222222,
+      "grad_norm": 0.3828405268996862,
+      "learning_rate": 2.2693843079168997e-08,
+      "loss": 0.5637,
+      "step": 5588
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.36113016022733274,
+      "learning_rate": 2.1483769139318823e-08,
+      "loss": 0.5789,
+      "step": 5589
+    },
+    {
+      "epoch": 0.9937777777777778,
+      "grad_norm": 0.3768477476221962,
+      "learning_rate": 2.030684322748e-08,
+      "loss": 0.5278,
+      "step": 5590
+    },
+    {
+      "epoch": 0.9939555555555556,
+      "grad_norm": 0.3406123490543402,
+      "learning_rate": 1.9163065733873718e-08,
+      "loss": 0.5495,
+      "step": 5591
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.35930426196369447,
+      "learning_rate": 1.8052437037707758e-08,
+      "loss": 0.589,
+      "step": 5592
+    },
+    {
+      "epoch": 0.9943111111111111,
+      "grad_norm": 0.36027880279376373,
+      "learning_rate": 1.6974957507231993e-08,
+      "loss": 0.5629,
+      "step": 5593
+    },
+    {
+      "epoch": 0.9944888888888889,
+      "grad_norm": 0.35053229120586743,
+      "learning_rate": 1.593062749967178e-08,
+      "loss": 0.5709,
+      "step": 5594
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.36176302718256115,
+      "learning_rate": 1.4919447361283477e-08,
+      "loss": 0.5147,
+      "step": 5595
+    },
+    {
+      "epoch": 0.9948444444444444,
+      "grad_norm": 0.3479530557886071,
+      "learning_rate": 1.3941417427321135e-08,
+      "loss": 0.5932,
+      "step": 5596
+    },
+    {
+      "epoch": 0.9950222222222223,
+      "grad_norm": 0.35241188990158684,
+      "learning_rate": 1.299653802205869e-08,
+      "loss": 0.5368,
+      "step": 5597
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3477666321556078,
+      "learning_rate": 1.2084809458756675e-08,
+      "loss": 0.5283,
+      "step": 5598
+    },
+    {
+      "epoch": 0.9953777777777778,
+      "grad_norm": 0.3511916467782138,
+      "learning_rate": 1.1206232039728814e-08,
+      "loss": 0.5763,
+      "step": 5599
+    },
+    {
+      "epoch": 0.9955555555555555,
+      "grad_norm": 0.33903781649015696,
+      "learning_rate": 1.0360806056242123e-08,
+      "loss": 0.5349,
+      "step": 5600
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.3908329648053974,
+      "learning_rate": 9.548531788605707e-09,
+      "loss": 0.5953,
+      "step": 5601
+    },
+    {
+      "epoch": 0.9959111111111111,
+      "grad_norm": 0.33430626933594315,
+      "learning_rate": 8.76940950612637e-09,
+      "loss": 0.5574,
+      "step": 5602
+    },
+    {
+      "epoch": 0.9960888888888889,
+      "grad_norm": 0.33532419227127347,
+      "learning_rate": 8.023439467141902e-09,
+      "loss": 0.536,
+      "step": 5603
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.339786407813064,
+      "learning_rate": 7.3106219189655875e-09,
+      "loss": 0.5509,
+      "step": 5604
+    },
+    {
+      "epoch": 0.9964444444444445,
+      "grad_norm": 0.4177124318949463,
+      "learning_rate": 6.630957097930601e-09,
+      "loss": 0.5578,
+      "step": 5605
+    },
+    {
+      "epoch": 0.9966222222222222,
+      "grad_norm": 0.3462871505625693,
+      "learning_rate": 5.984445229390012e-09,
+      "loss": 0.5053,
+      "step": 5606
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3344964090228748,
+      "learning_rate": 5.371086527683478e-09,
+      "loss": 0.5295,
+      "step": 5607
+    },
+    {
+      "epoch": 0.9969777777777777,
+      "grad_norm": 0.35909450849651425,
+      "learning_rate": 4.7908811961816514e-09,
+      "loss": 0.552,
+      "step": 5608
+    },
+    {
+      "epoch": 0.9971555555555556,
+      "grad_norm": 0.3575909896274203,
+      "learning_rate": 4.2438294272528765e-09,
+      "loss": 0.5351,
+      "step": 5609
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.34417726966370205,
+      "learning_rate": 3.7299314022631874e-09,
+      "loss": 0.5107,
+      "step": 5610
+    },
+    {
+      "epoch": 0.9975111111111111,
+      "grad_norm": 0.3617979529916427,
+      "learning_rate": 3.249187291609612e-09,
+      "loss": 0.5871,
+      "step": 5611
+    },
+    {
+      "epoch": 0.9976888888888888,
+      "grad_norm": 0.3860283996651874,
+      "learning_rate": 2.8015972546646674e-09,
+      "loss": 0.5862,
+      "step": 5612
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.34794627497920005,
+      "learning_rate": 2.387161439854069e-09,
+      "loss": 0.5626,
+      "step": 5613
+    },
+    {
+      "epoch": 0.9980444444444444,
+      "grad_norm": 0.3702338315420281,
+      "learning_rate": 2.005879984556813e-09,
+      "loss": 0.556,
+      "step": 5614
+    },
+    {
+      "epoch": 0.9982222222222222,
+      "grad_norm": 0.3715422126141103,
+      "learning_rate": 1.657753015205099e-09,
+      "loss": 0.5546,
+      "step": 5615
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.35147150675439054,
+      "learning_rate": 1.3427806472177118e-09,
+      "loss": 0.572,
+      "step": 5616
+    },
+    {
+      "epoch": 0.9985777777777778,
+      "grad_norm": 0.3374125581857282,
+      "learning_rate": 1.0609629850222292e-09,
+      "loss": 0.5381,
+      "step": 5617
+    },
+    {
+      "epoch": 0.9987555555555555,
+      "grad_norm": 0.3728159392608189,
+      "learning_rate": 8.123001220550208e-10,
+      "loss": 0.5776,
+      "step": 5618
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.35312986373587263,
+      "learning_rate": 5.967921407612487e-10,
+      "loss": 0.5096,
+      "step": 5619
+    },
+    {
+      "epoch": 0.9991111111111111,
+      "grad_norm": 0.3860609285919614,
+      "learning_rate": 4.144391126059688e-10,
+      "loss": 0.5974,
+      "step": 5620
+    },
+    {
+      "epoch": 0.9992888888888889,
+      "grad_norm": 0.33070673654349286,
+      "learning_rate": 2.652410980186204e-10,
+      "loss": 0.5489,
+      "step": 5621
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.3257573690291597,
+      "learning_rate": 1.4919814649294594e-10,
+      "loss": 0.5492,
+      "step": 5622
+    },
+    {
+      "epoch": 0.9996444444444444,
+      "grad_norm": 0.3290364798907909,
+      "learning_rate": 6.631029648707099e-11,
+      "loss": 0.5302,
+      "step": 5623
+    },
+    {
+      "epoch": 0.9998222222222222,
+      "grad_norm": 0.3601730077323053,
+      "learning_rate": 1.6577575501219854e-11,
+      "loss": 0.5653,
+      "step": 5624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.35874852658182327,
+      "learning_rate": 0.0,
+      "loss": 0.541,
+      "step": 5625
+    },
+    {
+      "epoch": 1.0,
+      "step": 5625,
+      "total_flos": 5002338659401728.0,
+      "train_loss": 0.6259202656692928,
+      "train_runtime": 89183.5058,
+      "train_samples_per_second": 1.009,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 5625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5002338659401728.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..53481dd51193a0e71928271293246738288877dc
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f02a35e8a576e842bc12654048b6e9d5e4215ef0
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "gate_proj",
+    "o_proj",
+    "up_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8eed616562ddec43e9cb7b0568fc4e34492df209
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f23be53bc331681a0eba4d7cd4a5068be16c7d7fde77a0cb8283b026b9c3940a
+size 671150064
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..14d0036f2d6ef7a43e27dd6ab3975619d8bb57a4
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 1152,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": false,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..36f429b81781b66e27ce12b99b246e24626f811a
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1389ce5936565078a758a0a88966971911a10e44b104a48ad0fce7876622673
+size 899633034
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..faf7d29d8e68afc9227e643496ccf27c8e88977f
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9033404973257572,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3702,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9200655478154655,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4091,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8568115900101086,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3421,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7276561318273437,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.2378,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.80630356176694,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1796,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8324188030112533,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.1077,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8239631251583869,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9959,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7639673779925863,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9213,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8024484626222647,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.853,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6531227626647346,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8696,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5003114081934658,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8229,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.43361106978087643,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8606,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.3989391164413557,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8001,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.37635276210072166,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.736,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.3749720016620839,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.7812,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.38822191417536056,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8408,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.33905688002757933,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.7776,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4009310777468807,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8082,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.396512227244631,
+      "learning_rate": 0.0002,
+      "loss": 0.7607,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.38131463269530463,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8209,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.388771662310261,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.737,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.37535124351711646,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7578,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.33381879122801084,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.7329,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.34652640125583617,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.7115,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.3449929315808586,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7175,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.3458815755530234,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8123,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.342032875398569,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7321,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.31766514898039433,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7129,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.32126953003960074,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7274,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.32536127409631893,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.7637,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.314487984166115,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.6898,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.31136998311660374,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.684,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.32482632417484913,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.7235,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3099515319366386,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.6873,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.30107064068065836,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.6785,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2941824903305334,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.6513,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.30994923914853884,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7031,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3037113147667233,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7109,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.3088199641547337,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7703,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.30294503947884,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7064,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.3009478698388032,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7027,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.32279287397339806,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.6876,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.30000672326660394,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.6574,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3013197784172169,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.6766,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.30003465545797525,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.718,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.28608785901924544,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.6536,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.29444853278829763,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.6514,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.2962330894917321,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.6802,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.29757697902300934,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.6645,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3219996792532637,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.6974,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.2885633090251933,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.683,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3032543344053866,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.712,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.2906786212439371,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7192,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.30456504810978957,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7153,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.3109362660245635,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.6616,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.29946421248714533,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.6651,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.2741698159306603,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.5766,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.2932323716173052,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.6815,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.3039664118741775,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7063,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.2835424116558143,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.6568,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.2903373260317771,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.6829,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.2908879079771421,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.6827,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.2820705964805036,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.6353,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.282779416359093,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.6722,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.2968739512906537,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7101,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.307347798581539,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.6958,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.2884637033507374,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.6534,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.30038576733393246,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.6998,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.29192806208266164,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.6843,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.2828143400360726,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.644,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.28909264028318,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.6869,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.30696934408671855,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.6412,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.28023002743316044,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.6529,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.28117857965998544,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.6315,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.28029665183934693,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.6506,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.2949384146250226,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.6907,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.28744268327217054,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.6445,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.29983300948861297,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.6671,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.3038698684581471,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.6499,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.2856930064996507,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.6292,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.28690016312392796,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.6734,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.29270353198941823,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.6628,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.28376580045137256,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.6208,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.2970281168568997,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.6468,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.2971345990174716,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.6821,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.29653412404879576,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.6553,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.27626070918423495,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.6238,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2876338029240312,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.6563,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.2965477447426151,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.6786,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3487212794023846,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7069,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.2878640107558607,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.6559,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.28113466605686666,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.6509,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.29288843234343526,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.6917,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2849405008631043,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.6801,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.2860251354270671,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.6768,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.28640334084107516,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.6338,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.28274623890495537,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.6042,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.29635263771652154,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.6534,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.28085511906044697,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.6577,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2994093719296395,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.6635,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.28016135788866825,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.6306,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.2859276049903214,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.6573,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.2949529432094483,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.6707,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.2952858148380289,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.6688,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.29687433263724833,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.6557,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3274603736840435,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.6333,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.2856127394350284,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.6575,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.2909543989200667,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.6626,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.3050216754760076,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.6738,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.2977253243133897,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.6509,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.2830623583930099,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.6083,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.2778721417914392,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.5844,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.2887698595117564,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.6406,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.29644839745372237,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.6733,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.29019043045865145,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.6343,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.31156049249994183,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7005,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.29428332863315004,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.6335,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.31497995062962403,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7361,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.2890976895301539,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.6126,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.2954134281814498,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.6535,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.28112456697322546,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.6482,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.28861115880222316,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.6562,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.30494694193499206,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.6658,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3004982208010861,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.634,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2901946576962155,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.6663,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.30503788029150036,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.5757,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.2891040387420322,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.6926,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.271266483580161,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.6163,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.27626211178457927,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.6008,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.28647957333813995,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.6492,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.29056701940711743,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.6904,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.29207159590757975,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.6857,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.29451399058695804,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.6291,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.2885154068087507,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.6329,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.26861913834773543,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.6224,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.28048011104639864,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6187,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.29326686568989035,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7009,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.29280070964817745,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.65,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.27106551349366576,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.6213,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.2943347329126408,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.6451,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.2891926431972801,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.6589,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.2805113538689055,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.6387,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.29902427992148634,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.6303,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3011522421150295,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.6969,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.2800238928479077,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.5755,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.27143926253293926,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.6028,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.2943161871981876,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.6681,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.289334392820797,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.67,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.28660574491977214,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.6533,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.28302827878206543,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.674,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.29249601311566104,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6481,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.291768170270842,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.651,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.28391259096195254,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.6293,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.2810392291736171,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6175,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.2867065084778468,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.6527,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.2770673446738684,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.651,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.27039777357068207,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.6344,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.2804212430044406,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.6521,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.2853133905615482,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6763,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.2786207325539522,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.6077,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.28095395945426455,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.6288,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.28815654977920335,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.5976,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.28660888249754035,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.6374,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.40957193545608156,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.5966,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.2898913111516385,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6482,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.2786245395654543,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.6125,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.2837721281604903,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.6363,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.2753352987391294,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.6374,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.26440385164137004,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.557,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.2857385564623794,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.6179,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.2874021322021662,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6356,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.27466595495457424,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.6043,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.28062479869313933,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.5984,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.26942030615250845,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.5955,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.295863709429359,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6662,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.28680145914254224,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6586,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.26897503141668383,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.6134,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.26840146894603445,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.6016,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.26929199365588663,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.5914,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.27139751196806866,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.6303,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3021205383636816,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.6159,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.28120367032543214,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.6679,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.2914956157054564,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.685,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.2711325731767127,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.595,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.27653328851479647,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6172,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.2705518127553975,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6154,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.27396647728634504,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.5925,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.29077363793367056,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.6423,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.2670728836626214,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.5596,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3084907250837108,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.693,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.28989918915509644,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.6512,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.26930064508030843,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.6083,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.2786463373437597,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.6663,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2617740825176673,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.5798,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.2655433417635973,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.6103,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.2772930381756503,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.6662,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.2739856896351615,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6097,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.27557629883711054,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6375,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.27001717421979243,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6071,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.27751369294324196,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6477,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.2615767149282083,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.5764,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.27820796119436153,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.6181,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.26386382755500865,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.5821,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.2731116018516801,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6133,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.26748852975928117,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.5941,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.2936091441118259,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6236,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.28627359457145607,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6127,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.2738706815025388,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6183,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.2678852345808725,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.5853,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.26977041336993773,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6083,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.28709476977831255,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6405,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3023809565549881,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7043,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.2747892811127097,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6532,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.2769456334862002,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.6155,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.26813200787234687,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6019,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.2750495110055987,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6278,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.27039140796375544,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.6389,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.2670339648572964,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.6254,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.26648845002200316,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.5973,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.2751017457817988,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6513,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.28732132512944286,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6752,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.27886124697137193,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.6554,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.2632289008248714,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.5678,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.2766947840759987,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.6319,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2709193107694985,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6424,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.26920447067686964,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.579,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.26282569011013873,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.5803,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.30253062757654425,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6894,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.28185798122986966,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.6701,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.2781631405974303,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.6409,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.2783668408558492,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6235,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.27110724885666837,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6276,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.27314657189067826,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6402,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.27542798995603934,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6218,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.2596868094828067,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.5767,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.28129216441253807,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.61,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.26316051294531795,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6103,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.2625379397079638,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.5954,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.26252034826320153,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6072,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2697403042950616,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6324,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.2694590499091851,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.5831,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.2799199854667609,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6431,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.2743505833018257,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.5958,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.2651616201833635,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.6146,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.2752066748441881,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.5988,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.2654586037334672,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.6081,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.26528475777384997,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6025,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2770333595628385,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.6071,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.27275921881990134,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.5878,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.27460484347510516,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6185,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.28627779272830833,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6642,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.2843553197115947,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.6002,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.2710438183016861,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.5952,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.28027455054015277,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6283,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.2888905069397006,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.6298,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.2706277103676121,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6021,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.2651883807306946,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.62,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.28550563597822987,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6325,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.27094358518408534,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.5908,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.2696764320997584,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.5853,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.2657302789360468,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6118,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.25881513219930835,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.5788,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.2692447885970564,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.5949,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2718587692319407,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.5973,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.30912170033120123,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7001,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.2895514425791462,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.5999,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.2716240374125864,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6194,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.26039229890804566,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.5755,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.26328712426525025,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6225,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.2632352487950469,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.6173,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.2679355024595054,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.5877,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.2764596288907185,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6295,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.26393403511341645,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.5606,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.27432367106489686,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6311,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2705500655990583,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.58,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.259559547111268,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.5805,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.25098071679480816,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.5676,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2601878941499058,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.585,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.2621421870871061,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6069,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.26963854802455367,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6408,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.2585727486396966,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.5798,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.2722397010778486,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6109,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.26035612594536284,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.5784,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.26856540487699954,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6243,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.26833969016804327,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6047,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.26631996087657145,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6033,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.25471868704875356,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.5583,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.26038533587620166,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.5813,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.2680253814911395,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6196,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.26645922323423354,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.634,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.26714834506150237,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6113,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.27089839537865856,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6277,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.2694898217685149,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.5931,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.25363892775400476,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.5465,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.25334437860494907,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.5622,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.2649852486157058,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.5981,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.2835099156763055,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6665,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.26830434990486457,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6173,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.267287012411873,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6243,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2609447173808279,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.5885,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.25543369285777234,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.5573,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.2671858794438797,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.5943,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.2593955077079989,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.5934,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.2823053871701031,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6346,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.26018937907020806,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6209,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.2675372716966033,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6518,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.26163986747743156,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.643,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.2658825302064493,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6278,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.2642568671878393,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6121,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.26205105488837227,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.5926,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.24945629808113587,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.5439,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.2705335916411653,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.624,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.2645126634344591,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6149,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2610460612633549,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6336,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.26002849948292583,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.5677,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.25609352645097033,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.5538,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.27158974314716067,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6088,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.2574000836503734,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.5901,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.2748624667047697,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6442,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.26669487646166096,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6091,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.2777401885135166,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.5698,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.2593552962147439,
+      "learning_rate": 0.0001,
+      "loss": 0.5972,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.2591542124686353,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.5676,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.25631941791655843,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.5971,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3016872466120396,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6017,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.2654014532106818,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6278,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.2544257649303829,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.5565,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.255499489168914,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.584,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.25565563033270083,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.606,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.25498787311023613,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.5737,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.2619218882769582,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.5824,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2638213417602552,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6056,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.26995009893117944,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6139,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.2542149578732742,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.5476,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.2513914064071582,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.5626,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.2680264121730716,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.5889,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.2555106002284551,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.5809,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.25875832684290795,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.5749,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.2556305902909127,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.5632,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.26185715283400635,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.5792,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.263891085361733,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.5838,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.24763138436301804,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.5698,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.2632830918798873,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.598,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.29082792629350307,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6009,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.2528825949585721,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.5542,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.25513868134702117,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.5803,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.2606526761809016,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6091,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2644594971115932,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6165,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.26082804034908924,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6031,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.25984517935489154,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6031,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.26762902286838963,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6284,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2586268717313893,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.5758,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.26965726852970207,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.606,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.27752151533234787,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6414,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.2524249669020979,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.562,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.259208248804178,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.5783,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.2629825216619024,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6019,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.27395230361912726,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6126,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.26373286997161893,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6125,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.25465433672334553,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.5687,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.2559530815925589,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.5918,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.258837584421683,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6003,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.26429023731078133,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6279,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2548865767012274,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.5805,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.33139314641727463,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.673,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.25474566299712065,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6018,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.27405703687372945,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6131,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.25215109009251463,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.5759,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.2624619027250661,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.5563,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.2757839250369891,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6195,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.2702293320510406,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6358,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.2633118638050873,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6259,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.26060980759540464,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6125,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.261668096986087,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.5948,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2622527701047893,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.5977,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.25217138102728326,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.5847,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.25792026179637567,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.599,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.2574842732152005,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.5711,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.25043437723529727,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.5556,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.24459799810234237,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.5551,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.2597607979330205,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.5947,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.2609330599899115,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6233,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.26861033183050814,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6213,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.25914606292067016,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.5887,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.2522124490494597,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.5696,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.2638978941366862,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.604,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.2482187971750229,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.5346,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.2594744324645565,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.5865,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.25836648727917544,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.58,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.25013205186206017,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.5724,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.2684922316211901,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.5715,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2507704655034135,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.5731,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.2604261604890052,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6268,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.25478837808916244,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.5484,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.2618405043606778,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.5823,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.2647994946291963,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6142,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.2504158827408613,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.5829,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2643682028445015,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6289,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.2726027118055879,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6087,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2656399134442502,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6232,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.2890321879342334,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.611,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.2500196584931281,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.5766,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.24714116207627002,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.5577,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.25915690622659465,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.5897,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.2582280893466348,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.5353,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.2493600118309958,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.5764,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.2696840274791407,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6043,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.2556281813318383,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.5931,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.262313158083901,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6144,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.26452950042359274,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.5894,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.2668491364046421,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6079,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.26169529088395,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.5893,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.2639409061834857,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6301,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.26920981548851625,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6425,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.24533060603048004,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.5441,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.24769714300264078,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.5517,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.2559740780545734,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.5569,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2567723917323897,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.5722,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.2563199159419971,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6036,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.2541037005115942,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.5595,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.2589425304040646,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.5767,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.2584911309515902,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.5766,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.2506728798970084,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.5728,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.26960684664060885,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5557,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.24805931464523356,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.5457,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.25987190280249384,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.5746,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.2573451608183604,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.5799,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.2654122685222725,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.5924,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.2597690654249084,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6055,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.25243145661314076,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.5289,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.25824439286544515,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.5814,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2544582038004977,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.5619,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.27471726182067324,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6207,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.26012264505830507,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6023,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.24092441320395502,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5172,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.25352004959955127,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.5565,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.25212776924369507,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.5687,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.2616338146981191,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6115,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.2552747385884955,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.5545,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.29390560725250053,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.5344,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.24986156430741233,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.5611,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.25614139660011026,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.5625,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.2538962500697107,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.5843,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2722022908484456,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6219,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.25999053783595505,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.5811,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.2632170630137,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.5454,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.2589898494047925,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.5568,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.24241528073093044,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5541,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.2792473797078321,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5895,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2493853268888264,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5597,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.2587460910617057,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6024,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.25970986514457856,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6028,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.2627758245134774,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.5857,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.25925746428700547,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.5724,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.41712529798668274,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.5884,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.25078015135979354,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.5775,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.2565385394651964,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.5995,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.25156874540996954,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.5586,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.24487007483044448,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.5574,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.2598900854989058,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.5817,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.24895228375299977,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.5347,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.2661519054238711,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6037,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.24630003568969208,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.5472,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.25625016668788175,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.5795,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.2505624784270415,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.5897,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.43546402945514173,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.5565,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.2626787083120136,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6334,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.25090989395570357,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.5973,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.25336725221102824,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.5828,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.24057783653954204,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.5532,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.23959198782404773,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.5277,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.24774383573640013,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.5675,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.24960252681386738,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.5678,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.2484795625510042,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.5662,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.24545186743564423,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.5524,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.2516789090041251,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.5912,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.2607401899793407,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6134,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.2585203137406373,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.5772,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.25385753554872703,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5921,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.24600250850794042,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.5698,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.2576103052865085,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.5909,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.25480879506443094,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5544,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.2503529255141808,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.5267,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2586930060114874,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.5903,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.2530021008014392,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.5622,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.25162412800145234,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.5582,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.2531143883528878,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.5705,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.2438321700064606,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.5522,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.26017372144360845,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.5923,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.2592013861597092,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.5751,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.2604178559013015,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6108,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.24911407942551256,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.5424,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.2738283985167626,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5523,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2425227355627501,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.5421,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.2656925026244092,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.602,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.24927991955892181,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5562,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.2517727024587019,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.5613,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.2539945051749779,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.5609,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.26460778710159644,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.5789,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2501401457440102,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.5767,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.25108695158570427,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.5748,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.26144078355869815,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.5559,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.2424811775551495,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.5232,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2629144195906486,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.5582,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.2632536996658287,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5965,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.2578948318399331,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5688,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.25383747050695094,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5569,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.2574345951818969,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.547,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.27222780005809916,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5861,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2606781785106011,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.5884,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.2551876492978511,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.5734,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.24618973056746155,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.5319,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.2591979469770761,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.5937,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.2573694155924506,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.5453,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.2504361140343124,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.5583,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.2469292086578109,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5438,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.25818318752576547,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.5827,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.2671447662861101,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.5438,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.26518579024890976,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.5935,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2682839201056169,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6102,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.2668231948844124,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6063,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.2568029936082383,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.5416,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.2490028841911143,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.5445,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.25650024875764577,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.545,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.25741474548439974,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5447,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2562124919206727,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6024,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.26154778091279046,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.5936,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.2655385936068879,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6057,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.25352887609445324,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5775,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.25670604310461265,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.5839,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.2581312700128164,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6202,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.2587743828479023,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.578,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.25323698738285116,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.5811,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.26379924381872405,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6065,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.25970803047318636,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5775,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.24513668859607107,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5411,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.2593808847300968,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.5887,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.26069262793188025,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.5884,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.255292393884229,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.5643,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.28751175707868937,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.5643,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.25959765605546553,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.5339,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.24980709115022895,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5695,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.2452209267199703,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.5165,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.2524793924892976,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.5597,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.26606410256191265,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6186,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.2533467970780035,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.5616,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.25755411819862256,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.5627,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2575217298135753,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.581,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.24982665016048144,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.5396,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.2641747404541249,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5777,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.2756926446548946,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.5419,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.24519930818122074,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5412,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.25527219219142355,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5671,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.24507775526680298,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5435,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.25279856928031735,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.5232,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.24229897895278965,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.4994,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.24367438318292856,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.5668,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.24611400184096086,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.5576,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.25894143808589626,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.5907,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.2544556139308709,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.5649,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.25409370996140834,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.5291,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.24278223953313688,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.5242,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.2692566239437579,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6204,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.24806440013057024,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.5665,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.2461628864242156,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.5538,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.2559122982021189,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.604,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.2606246789860794,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5897,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.2531046625803253,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5381,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.2644909172549217,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6065,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.25998843134511657,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5756,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.2438607876632911,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5519,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2557809493211503,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.5794,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.24356844865025518,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.5457,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.2554892701473439,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.5605,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.2463181662462567,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.5622,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.24714707374932277,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.5722,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.2523035198417324,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.5323,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.2446822675297152,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5344,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.257524947253814,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.593,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.26661384217269185,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.571,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.2626126806242866,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.5903,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.25084123396603797,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.5972,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.24678763420634603,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.5488,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.2548720457798887,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.5851,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.25074968600043734,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5592,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.25708420318674485,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.584,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.2589322326993553,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6178,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.25656778306444317,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5435,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.2558065441349567,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.5498,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.24053570452087028,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.536,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.2716175789734306,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.5876,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.2525541128663851,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5904,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.25817865905570425,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.5889,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.26151537175840095,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.5729,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.24225587612941454,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5125,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.2628493753306178,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.5848,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.24049741007521916,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.5325,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.25287426448549716,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5807,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.2552367256720679,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.5416,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.24544619556081892,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.5677,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.30400369196431554,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5902,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.24800737902624073,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.5535,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.25712522301314844,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.5875,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.2528708741278371,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5716,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.25681466531883856,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.5543,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.24032602027009492,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.5213,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.25652917793532737,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.5792,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.26091177722146935,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6043,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.25003264999584973,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.5345,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.24675687713551328,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.5462,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.25019378660615477,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5657,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.2528106317795277,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.555,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.26073189355575477,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5597,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.25137663391032267,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.5659,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.2488256428995093,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.574,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.24954010199025753,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5641,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.23994659654705344,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.5419,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.2550508858357467,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5831,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.24443138036583872,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.5389,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2607132224613469,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.5493,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.25489244073480916,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.5677,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.23953609103058476,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5311,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.23741179011189767,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.523,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.251135351613199,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5472,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.24728636723210387,
+      "learning_rate": 0.0,
+      "loss": 0.5395,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 372789103198208.0,
+      "train_loss": 0.6182057282924652,
+      "train_runtime": 8257.1583,
+      "train_samples_per_second": 1.211,
+      "train_steps_per_second": 0.076
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 372789103198208.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..53481dd51193a0e71928271293246738288877dc
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6964ef6c9145e5d4c10ba065397534061cd2c5fb
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..853514ea06ae28826f5e3262aa0a07c76d6eb9a1
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65ff6fb22dfaec71c00b932805e5dee92f9b92d22dc3cd5f6f58dcdcafd3d949
+size 671150064
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..14d0036f2d6ef7a43e27dd6ab3975619d8bb57a4
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 1152,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": false,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..02c09000e50f126169efa2be79b44cc6f2d03acd
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d7005b204a93bd5f662d33d61974e83298a58f3195ae20f0ace5eb2d7251256
+size 899633034
diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7cd300a9de528940536bda1a8551389bb37fc5e
--- /dev/null
+++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9456431410164449,
+      "learning_rate": 2e-05,
+      "loss": 1.4684,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9595554488188345,
+      "learning_rate": 4e-05,
+      "loss": 1.3749,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7819824872221672,
+      "learning_rate": 6e-05,
+      "loss": 1.3507,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9167023815526777,
+      "learning_rate": 8e-05,
+      "loss": 1.2165,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.847495342554959,
+      "learning_rate": 0.0001,
+      "loss": 1.1304,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8818427732884414,
+      "learning_rate": 0.00012,
+      "loss": 0.9715,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7726751380491141,
+      "learning_rate": 0.00014,
+      "loss": 0.9336,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6483561690537337,
+      "learning_rate": 0.00016,
+      "loss": 0.8903,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5687921838840364,
+      "learning_rate": 0.00018,
+      "loss": 0.7977,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4217197943554311,
+      "learning_rate": 0.0002,
+      "loss": 0.8058,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.39301658901144887,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8494,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4590183081730723,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8636,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.41874721337638304,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.8045,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.36156401408882166,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.7129,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.43359389552236977,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.737,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3732292263108806,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.721,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3672574865848022,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.768,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.3788969898154259,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.6994,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3591018825770351,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.7546,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3347980700973235,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.7799,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.3176137835338356,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.718,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3110224161054545,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.6712,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3176740651877855,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.7517,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.31311011452703175,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.7431,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.31035422091689385,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.7121,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.2986181957125755,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.6987,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.32611739837441994,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.7732,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.290949601128612,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.6783,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.30951784533907173,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.7126,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.30642887130649027,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.6716,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3253752205567512,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.7618,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3074017490574547,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.6863,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.2991620525987948,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.7061,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3226295279455581,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.7164,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.2942063456789797,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.6732,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.30145830255706224,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.6917,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.2976180783817878,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.6688,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.28814304785348366,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.6852,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.2968046184063759,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.6827,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.2848652475342455,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.6905,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.29541131745521493,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7409,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.30472745054693,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.6976,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.287978936057206,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.6889,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2917374421163095,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.7072,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3014118067717161,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.736,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.27940318160556293,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.6561,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2764033621334266,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.6425,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.286603211032781,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.6984,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.29296348821168333,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.6898,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.28097584337963444,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.6858,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.30154798580564207,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.7145,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.30795671266773783,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7126,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.28661331080986213,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.6722,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.27995432204171594,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.6533,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.2881701437861025,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.6864,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.2863440297333805,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.6545,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.2880700843682891,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.6694,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.2774792629615745,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.6505,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.2737154598370057,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.6388,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.27952240531043854,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.6802,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.28225508446141556,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.6638,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.28445391301141404,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.6997,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.27249304099350813,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.665,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.27827800737681285,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.6747,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.28236239197955565,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.6804,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.27711713330173016,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.6825,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.27559978427300996,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.6893,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.28871955197128163,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.6574,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2840675164080103,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7286,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3083968957018995,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.6578,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.2843871593471055,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.6792,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.2791531418567351,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.6779,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.28436440881242775,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.6461,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.28210657946106504,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.6376,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.27471101989657526,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.6589,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.27255559553885733,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.6624,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.28238069396965426,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.665,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.28091292280312435,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.6582,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.2819117395356436,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.681,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.2808167623131368,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.6635,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.2727980766577742,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.6387,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.27946610199292726,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.6636,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.2776930025762807,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.6629,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.29547305795196493,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7105,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.27992559053351135,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.6217,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.2836978378018261,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.6364,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.2859793795996647,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.6611,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.27670508997854193,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.6321,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.2804552643700331,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.654,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.2988662193267764,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.6631,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.2850945559743476,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.6546,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.34508854455612903,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.6181,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.29772302651197313,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.6953,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.2814403372374441,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.6711,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.2666134049420953,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.6384,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.2759217127531887,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.5927,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.27234087211717334,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.6047,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.28929394588800267,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.709,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.27285248521549677,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.6275,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.27087214179135655,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.6288,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.2759835535783692,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.6529,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.2732937178984582,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.6443,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.27150370023008635,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.6401,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.288480165883741,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.661,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.2715185186712244,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.5861,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.2890522618399928,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.6547,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.2809942944669504,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.6502,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.2770621559504025,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.651,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.28643434801423096,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.6717,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.2838267721875221,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.6666,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.2904515511293253,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.6788,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.27521059578174806,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.6421,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.28435477767209977,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.6806,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.2836689862948711,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.6633,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.27993720267562205,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.6031,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.26823803322012507,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.6258,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.2755753433763202,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.6238,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.26378813575701704,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.6105,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.27807647013178804,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.634,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2850302043584264,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.6473,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.2842021607325469,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.6666,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.2646241757350502,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.5882,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.27553697595802684,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.6553,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.266869763753527,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.6334,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2891373420552467,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.6232,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.29054049719448544,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.6685,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.26991808658939265,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.6451,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.2826019428481463,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.6748,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.26989180549288744,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.6366,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.2693716385431619,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.6365,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.2587716538530605,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.5821,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2733425696050926,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.6601,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.28260898546044794,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.6598,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.26073598361962874,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.6062,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.26672014153172025,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.6664,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.27064468874961567,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.6409,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.2678777921010367,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.6146,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.2831646298939026,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.6641,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2863253592057525,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.6653,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.26496566477700495,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.6061,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.2741590428796881,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.683,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.274967394883945,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.6738,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.26748184911275685,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.6259,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.27571463833666354,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.6824,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.28204383254583004,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.654,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.2737273526998308,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.6383,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.26718453878335136,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.6336,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.28928334642647074,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.6328,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.27269344035826515,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.6261,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.27361132880394723,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.6423,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.2666674720378228,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.6397,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.26339311415617495,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.6184,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.30511330160542777,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.631,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.27393530074277894,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.64,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.2630950654462822,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.6256,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.26558776558003316,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.6329,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.27737305767773224,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.6808,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.34438277093807235,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.628,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.2716378054950986,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.6197,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.25713458936107125,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.6055,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.2556414802851472,
+      "learning_rate": 0.0001,
+      "loss": 0.6088,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.9117970631393475,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.6301,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.26669052461677767,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.6326,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.27925439381355327,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6681,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.25468990353229803,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.5743,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2629677375763853,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.6055,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.26370207518366157,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.6102,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.26441992972242784,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.6238,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.2798821979607459,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.6568,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.2748187095456481,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.6068,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.26830989542228445,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.6217,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.2624748827277877,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.6203,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.2552606780949764,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.5663,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2557632710956968,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.5918,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2666361550200854,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.625,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2560883378063079,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.6032,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.2583406590237059,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.6075,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.27012077672072304,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.645,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2697472104569865,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.6357,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.2704698732844997,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.6278,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.27596503294944713,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.655,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.26527906505737747,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.568,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.2697094402719354,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.6172,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.2738859813879391,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.6275,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.25939271181980195,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.5639,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.2671651992817328,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.5919,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.2688565475090851,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.6312,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.2784248260332949,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.6124,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.26997057247712153,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.648,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2707092694992218,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6108,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.2694317279781858,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.6399,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.26041601563873007,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.5889,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.25291835995919404,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.5911,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.26551700863248473,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.6089,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.25882948329790545,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.6298,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2529947012716399,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.5914,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.32497365798332417,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.638,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.26296553022742203,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.6258,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2657709305402464,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.6495,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.25801033104181925,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6019,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.25805367525126394,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.5931,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.26487929148998474,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.6711,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.25255797052501494,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.6088,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.25317945487768007,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.5703,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.2545495908249795,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.5878,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.32318784276902335,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.6273,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.25499189403717754,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.576,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.26227335079319297,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.5812,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2514697436323961,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.5734,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.27670627140391063,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.6589,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.26233287953524004,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.5956,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.25899185821758536,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.5981,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.2695534934217965,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.5788,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.2622202021562537,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.6263,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.26561851569401834,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.6423,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2519941661268092,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.602,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.2585815299560701,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6187,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.2579931313485816,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.6252,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.26138279560269373,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.6191,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2652328876562207,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.6341,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.26814443178673353,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6069,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.25716580395548105,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.5923,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.25262697851315913,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.598,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.25536363817600527,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.5786,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2515414352273644,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.5973,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3082923451611295,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.5738,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.2665831077382451,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6079,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2570054423385765,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.5884,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.2764118800977991,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.5664,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.2659015981177506,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.5945,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.25185649916492603,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.5709,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.26390095033023064,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.5989,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.2530850728794403,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.593,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.2565488992653013,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.5769,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.2611020063864013,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.6358,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.2464939099364865,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.5717,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.25190580400469725,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6154,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.2624308866231954,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6333,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.2542363438650353,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.6209,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.25002614059735057,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.5733,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2522774493843005,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.5709,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2592098056140366,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6038,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.2545528496860185,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6015,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.26574395128852707,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6239,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.2624335087446576,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.6181,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.2586614704436218,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6334,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.31003500089156544,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6078,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.24237023956212553,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.5494,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.2625073020569576,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6057,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2539419606355253,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6047,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.25884298262354777,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.5951,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2660197122315272,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.6259,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.25655988921277334,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.568,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.2457259860690368,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.5566,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2647745370183451,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6137,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.2526743886541512,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.5973,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.26287332654405005,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.5934,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.2614410753690634,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.6114,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.259783723422659,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.5961,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.24965207576444098,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.5706,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.2958426590398693,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6272,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.25258297001548335,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.6006,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2611674989165653,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6272,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.25524659220422524,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6174,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.2575177709932516,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.5956,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.2561395979203794,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.5667,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.25141728122486434,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.572,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.25964673762588375,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.6372,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.256690467072949,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.5884,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.2605600488715863,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6151,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.2520719685895574,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.586,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.2544316858533973,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.5852,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.25086193401636275,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.545,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2717178693247369,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.6135,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.26446663858778996,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.654,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.25710288834015854,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6077,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.2614053987061522,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.5887,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.25803580837193296,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.5922,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.25650163916574076,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.583,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.25422924226179444,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.5697,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.24957575142600866,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.5912,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.2515411538551669,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.5846,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.2601071856992811,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.5616,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.2639229721318898,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6368,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.24857817483163056,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.5593,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2810784592988802,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7009,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.25502077384196564,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.5788,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.24704174727298683,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.5657,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.26397141045569056,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6363,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.2524305059499819,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.5921,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.2697223617632766,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.6327,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.26555534553448457,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.608,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.25409868011892073,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.5747,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.252487846427678,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.5735,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2571510334874647,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.5865,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4760901811369371,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.542,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.26372826176481556,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.5884,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.26175507028814576,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6167,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.26047362323447143,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6128,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2454540274824027,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.5388,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.2476784963430826,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.5393,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.24671440407493223,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.5966,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.2571397406086264,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.6121,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.2895940448351629,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.5805,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.27720052526167127,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.6128,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.2602744150458922,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.6157,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.2565662022949563,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.616,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.2562137901406146,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6134,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.2525816370267746,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.5834,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2528167501756759,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.5784,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.2557747985616951,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6049,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.2608436955663202,
+      "learning_rate": 0.0,
+      "loss": 0.5618,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 185984439222272.0,
+      "train_loss": 0.6505028414420593,
+      "train_runtime": 4096.0965,
+      "train_samples_per_second": 1.221,
+      "train_steps_per_second": 0.076
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 185984439222272.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..83dcfcd3ed6419e6ca1ff0b755b820036a3c711e
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2b44934230d56ecc96849e5d3a81e9d5c2c1fd21
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59ba24de4775f1291ab252b12fb0b3396485600c1e19fb4a466d9300e8b3c920
+size 671150064
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5a6bc71bf97737b8a369ad3a1c8bd22681dad003
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97cb23a344d78154074f4de89e6139a9a8c1a763b1c7b376fe682fb0d79a642e
+size 918507402
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d7655452f071ab8a74d711529d7b43f18b6dad4
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.9138792865490715,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.6749,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 2.0368399430231263,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.8516,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.8090355867591892,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.6712,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 2.762713388884416,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.6305,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.2794292650844834,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.5747,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.4340577236198235,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.6697,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.4329234538490498,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.5619,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.2892622484612177,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.3965,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 1.18604697598696,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.5188,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.1720977115105928,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.3695,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 1.091981284001823,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.3952,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.0817636217239512,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.2642,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.8318883186657168,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 1.3037,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.0485130732801884,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.3812,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.2774578260059815,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.519,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.0273239813225716,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 1.2408,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 1.2360823664444458,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.4999,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.9587869083216982,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 1.4482,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.9650065379953744,
+      "learning_rate": 0.0002,
+      "loss": 1.2293,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.0048012835563842,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 1.374,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.9973326787801201,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 1.4009,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.0109602492648537,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 1.3765,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 1.1201615977846267,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.4827,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.088872492346361,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 1.2704,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.941299634470907,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 1.242,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.0378299622697194,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 1.3798,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 1.0659314149162376,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 1.4877,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.0260255286150333,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 1.384,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 1.2156029529215326,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 1.402,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.924110135826946,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 1.4026,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.9627079859103094,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 1.2848,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.0238335211957148,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 1.2772,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.9429351205237804,
+      "learning_rate": 0.00019973673694024,
+      "loss": 1.3437,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.9525674623335832,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 1.3149,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 1.010878868364398,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 1.4853,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.232779975593512,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 1.4739,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.9746473280525739,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 1.3523,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.0390083937732542,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 1.3505,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 1.0323141306052472,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 1.4415,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.9607178707827044,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 1.2618,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.9680073099797065,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 1.3179,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.2077366195171064,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 1.3976,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.846005489716849,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 1.1975,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.9943262552019229,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 1.3532,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.1057388030171886,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 1.3299,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.962002465600493,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 1.3698,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 1.1248573964326298,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 1.4142,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.9802161923960824,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 1.3274,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.8727888982064798,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 1.3386,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.0204152929412373,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 1.4171,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.8940110524050955,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 1.3084,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.9124671150793904,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 1.3393,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 1.1643847828013019,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 1.5376,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.868101186153599,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 1.1898,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.9478830042552752,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 1.3806,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 1.451106724764284,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 1.3543,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 1.0230592284365352,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 1.4803,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.0204098395320889,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 1.4614,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.8651716760169919,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 1.3444,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.9703079648386799,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 1.3144,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.9979421069739579,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 1.4109,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 1.0324424303171396,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 1.3287,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.9785106362003406,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 1.3706,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.9420094371561448,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 1.3212,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.8905383774972145,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 1.2238,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.9378780481865792,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 1.2911,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 1.014808075570759,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 1.3628,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.9984699648320681,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.3013,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 1.0586654354662475,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 1.4085,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.9799763176865066,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 1.3189,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 1.0439261227003327,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 1.3016,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.9661114000480542,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 1.1563,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.9538595171022275,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 1.3644,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.9279761643712138,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 1.3325,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.0814585612185719,
+      "learning_rate": 0.000195815455670239,
+      "loss": 1.4022,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 1.0666505125215753,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 1.3003,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.9049371013277258,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 1.2489,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.920091059105391,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 1.2869,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.8923587994762451,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 1.277,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.8899519924175298,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 1.3673,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.9471778221939827,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 1.364,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.9079412832884837,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 1.2632,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.9870969446400838,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 1.3941,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.8702555425831592,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 1.2527,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.9895405215588456,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 1.3044,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.9884095855899157,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 1.294,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.9092570528379283,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 1.3049,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.8905401474373852,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 1.2914,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.9502646619662192,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 1.3151,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.929852852986334,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 1.2692,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 1.0467930508496202,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 1.3036,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.9318355455106352,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 1.3428,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.9581321473639015,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 1.1883,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.9096907067626209,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 1.3583,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.888898125705102,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 1.2552,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.9996772116461426,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 1.2999,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.9944042594023413,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 1.2563,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.7927854886824046,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 1.0828,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.9063951522976151,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 1.2878,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.0005464752308368,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 1.346,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 1.0773036588879836,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 1.4719,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.9902341156894181,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 1.3133,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.9202584867433755,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 1.2159,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.8928202430331159,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 1.2781,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.8786162241735016,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 1.2906,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.8818318634341324,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 1.2183,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 1.016850976151942,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 1.2798,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 1.1125718086502039,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 1.4232,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 1.0636099998751212,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 1.3642,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.9373493361047722,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 1.3849,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.9688474159342598,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 1.2488,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.8412981721836017,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 1.2149,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 1.0455522755060047,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 1.4969,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.8819959687210541,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 1.2709,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.0390606100176032,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 1.2512,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.8902083482060692,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 1.2918,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.8468591087899736,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 1.2147,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.9723033959967523,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 1.3743,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 1.0455319231677251,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 1.2743,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.9408439301893431,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.3922,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 1.0263620281862005,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 1.2693,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.9836156376006685,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 1.2372,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.8441357232790682,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 1.1687,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.8881386393138146,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 1.2011,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.8483056979385163,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 1.4101,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.894842476652938,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 1.3273,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.8948620187331842,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 1.2945,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.8603505640754253,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 1.2164,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.9260794605533464,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 1.3015,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.924634616629723,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 1.3371,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.8829903828411027,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 1.3814,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.9791894951236914,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 1.1984,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.8856840640474363,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 1.2842,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.8671119979857622,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 1.1748,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.8591943456089417,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 1.2471,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 1.0462864848081617,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 1.4317,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.879590039265427,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 1.2346,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.9258746466555289,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 1.2666,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 1.026660297097013,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 1.3158,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.9437344711094847,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 1.1704,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.9068493066951973,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 1.2705,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.8872777903966959,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 1.266,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.9348855211277328,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 1.3649,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.9697822035173204,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 1.2889,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 1.0179014202581467,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 1.288,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.981204112418224,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 1.2186,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.9115720707069629,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 1.2282,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.8356969676801423,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 1.1469,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 1.1175490899050453,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 1.4246,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.9341528226020359,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 1.3495,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.9584956431596955,
+      "learning_rate": 0.000177485710710289,
+      "loss": 1.2838,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.9478525671237141,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 1.2857,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 1.011731532566276,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 1.295,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.9212288319149741,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 1.2974,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.8918183647310739,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 1.3973,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.876633321580276,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 1.099,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.8734092115955532,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 1.2548,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.9509936530005848,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 1.3433,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.9876147790154572,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 1.3589,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.8404573991625482,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 1.2323,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.8718197594420647,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 1.303,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.854075594306203,
+      "learning_rate": 0.000173756913120621,
+      "loss": 1.245,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.8719731050503491,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 1.2371,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.9174901722888825,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 1.2911,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.843494724446203,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 1.1555,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.8269647090580537,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 1.2305,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.8538009131480417,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 1.2288,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.8200571038773096,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 1.1042,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.9985090587395257,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 1.2814,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.9043088861061557,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 1.2904,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.9720144762524422,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 1.2934,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.9429071983526343,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 1.2195,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.9922714848946325,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 1.386,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.8416804433344276,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 1.2287,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.9113976056217002,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 1.28,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.883839472667245,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 1.2327,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.7691670822483967,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 1.1303,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.9882033137881157,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 1.3492,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.9363276145321529,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 1.3594,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 1.044069691358691,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 1.2119,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.8122773071076308,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 1.0944,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.8127517020671314,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 1.2359,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.941263554003775,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 1.2414,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.8393462710847754,
+      "learning_rate": 0.000165592860169994,
+      "loss": 1.1938,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.9969899499563986,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 1.2476,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.8763384503459395,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 1.3152,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.8489189147455701,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 1.348,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.816074167030023,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 1.1743,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.8654989851443282,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 1.2716,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.9377395643561406,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 1.2408,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.8243605857754032,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 1.2268,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.9870712994522676,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 1.2249,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.9089766343458752,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 1.282,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.8809977197819951,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 1.2828,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.8972386215910175,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 1.244,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.8800981648292031,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 1.1939,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.9217855347492754,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 1.1995,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.9350718448332347,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 1.3191,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.8388711214458865,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 1.2952,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9667195028151565,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 1.2687,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.947491495591094,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 1.2749,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.9418623165919426,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 1.2651,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.8811125597170616,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 1.331,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.8239429953850183,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 1.1908,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.8218011237910456,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 1.1712,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.792556863148284,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 1.158,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.7591037692200457,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 1.2378,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.9134765422918922,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 1.2695,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.9396029761811607,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 1.3359,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.9700687750462483,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 1.2743,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.972971356427299,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 1.2327,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.865753023900172,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 1.2224,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.8791308977470417,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 1.2124,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.8645115188012078,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 1.1832,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.8394381140725087,
+      "learning_rate": 0.000152669141192587,
+      "loss": 1.2047,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.8556568829596263,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 1.2568,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.7871574639241793,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 1.1903,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.9051963356204792,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 1.2939,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.9181618868204299,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 1.2466,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.8190508823611149,
+      "learning_rate": 0.000150448286344864,
+      "loss": 1.2468,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.8681848868003743,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1276,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 1.0010246288434874,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 1.3548,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.9054763292752849,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 1.1697,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.8853715938791729,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 1.1584,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.8465500878684764,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 1.2424,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.9212138041787676,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 1.1762,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.9611719452243532,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 1.3087,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.8827786916757613,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 1.2625,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.8387676346336079,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 1.1485,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.8494532672088838,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 1.1955,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.870513532778789,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 1.2725,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.8910503853194656,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 1.1638,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.9040214513192539,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 1.2113,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.8851553164826061,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 1.3086,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.8234234579800335,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 1.2142,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.8892509071021439,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 1.1671,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.899688472903065,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 1.2334,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.8903533861688723,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 1.2552,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.9535230774328497,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 1.3028,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.9120573275838028,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 1.2507,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.8068854936446114,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 1.0953,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.8715233148733574,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 1.2744,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.8099131715159511,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 1.1879,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.9613678265565011,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 1.2067,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 1.2594485201695729,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 1.0867,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.881910095099135,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 1.2392,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 2.5785824113387066,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 1.2812,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.8581277392188488,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 1.2019,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.8908049815241303,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 1.2827,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.9328597459880543,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 1.2972,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.8819384667780155,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 1.2383,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.923991271431393,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 1.2378,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.852856759306124,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 1.2021,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.9517124148223832,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 1.2537,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.9943489920406982,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 1.298,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.9424839438430717,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 1.3916,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.8623889662495645,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 1.2428,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.8969029098876905,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 1.258,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.8685679563263606,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 1.2611,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.9380453581806747,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 1.2899,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.9128492251598493,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 1.1631,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.7921675182825618,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 1.1727,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.8741226372243694,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 1.1796,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.9624066895852776,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 1.2753,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.8623924847866271,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 1.2239,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.8754993437924429,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 1.2303,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.9196818395200124,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 1.2629,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.7725792736098621,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 1.1907,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.882188597768767,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 1.15,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.8440779606854073,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 1.1982,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.8230773009080197,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 1.2513,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.8372385893798061,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 1.2188,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.8872124615795473,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 1.18,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.9019740107544314,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 1.2596,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8844802528802836,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 1.1859,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.8145917051737542,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 1.1844,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.8532928487511529,
+      "learning_rate": 0.000123117632211497,
+      "loss": 1.1622,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.8620582995155897,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 1.1533,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.8594768364941002,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 1.2915,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.7413865897201313,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 1.0711,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.7847519476510114,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 1.0843,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.8831366875265851,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 1.246,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.8985205839545358,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 1.2101,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.8917026992840591,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 1.1728,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.89096820779557,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 1.2552,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.8296500927521173,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 1.1848,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.8037711878867846,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 1.2396,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.9103068328992188,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 1.1845,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.8647079909838531,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 1.0828,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.8298068370580297,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 1.2582,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.9292697985594613,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 1.2346,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.7646422463520625,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 1.0674,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.8290176537474588,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 1.128,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.8399824958737899,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 1.1764,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.8971948834731232,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 1.214,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.9782768537494978,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 1.1912,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 1.23581044725264,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 1.2208,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.7668650057527652,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 1.1246,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.8836310045333498,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 1.2389,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8199386285851565,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 1.0902,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.8995441990206351,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 1.2153,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.7953824562393018,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 1.1021,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.8041206611379477,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 1.1319,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.760043694320985,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 1.0802,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.8402613615689775,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 1.1854,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.8370506536169573,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 1.3297,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.8756542853970231,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 1.167,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.9887192919613506,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 1.2035,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.8777343614550543,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 1.1611,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.8756411084607579,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 1.2932,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.7933193873769244,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 1.1609,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.8848359151078187,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 1.1338,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.7547677514371438,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 1.13,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.9183224690124575,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 1.2251,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.7649045744367469,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 1.1016,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.7984049291501666,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 1.1782,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.8430512686337509,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 1.2102,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.7590493446338831,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 1.2445,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 1.010969864158178,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 1.163,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.906697269700168,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 1.1266,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.7181242074258878,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 1.1142,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.8219401840378927,
+      "learning_rate": 0.0001,
+      "loss": 1.1711,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.9611267122420954,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 1.2323,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.9877413453035904,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 1.2141,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.8368823554633094,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 1.1423,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.8556826186373597,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 1.2504,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.8420396185143431,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 1.1729,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.948304811546288,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 1.2101,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.8559182631572183,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 1.1533,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.8954018911520171,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 1.1143,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.9147431991804071,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 1.1952,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.8148360981912695,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 1.1811,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.9031070907439837,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 1.227,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.860370451267779,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 1.1484,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.8152521797221018,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 1.1894,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.8398257018415263,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 1.2223,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.9008393901320019,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 1.2366,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.7753201517758531,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 1.1435,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.9073977985666707,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 1.2315,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.871600245938443,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 1.2583,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.8841066100524158,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 1.2344,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.8143250478949455,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 1.1382,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.7426915405897001,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 1.1406,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.8864003234551969,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 1.1746,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.7776617832611498,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 1.1841,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.9102059489215345,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 1.1214,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.9094619727242491,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 1.218,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.8033485035147858,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 1.1593,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.9253378190577354,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 1.3105,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8591530532228647,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 1.2892,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.7988519283164964,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 1.161,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.9708577800367942,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 1.2504,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.848809978081281,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 1.2201,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.7685979840925905,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 1.1189,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.869108252327602,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 1.2034,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.7859812284213198,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 1.0982,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.8967511969562878,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 1.2392,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.7783806511972009,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 1.1966,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.7643003598307078,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 1.1042,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.8427919724796142,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 1.1576,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.902025440967138,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 1.2136,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.7404334436478929,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 1.083,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.9751026591922226,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 1.3347,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.8691363843427901,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 1.2633,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.9154939493330465,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 1.166,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.8451961940267377,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 1.0749,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.8398861442241642,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 1.1373,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.8172425329442209,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 1.1056,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 1.5755117690872584,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 1.1983,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.8784668339403303,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 1.1689,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.8906976548396875,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 1.2108,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.825483148194111,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 1.3095,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.7898151196765646,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 1.1102,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.7905430424069427,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 1.0784,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.8196770468772703,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 1.1636,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.888862504227803,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 1.2867,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.824844609674756,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 1.2041,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.9152932771920264,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 1.1682,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.8516991165662066,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 1.1531,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8253284733353577,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 1.066,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.808754642859673,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 1.2327,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.8684342530267641,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 1.1757,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 1.0676860994053174,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 1.2155,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.7701555681194645,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 1.2048,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.9048160455852856,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 1.3376,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.7957470102026036,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 1.0878,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.8093378397441714,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 1.1519,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.8203721996256516,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 1.1611,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.811712204943522,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 1.158,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.8877301877619368,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 1.1956,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.7740209396211473,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 1.1664,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.8041506425838244,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 1.0035,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.6850965549453459,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.9761,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.7357434070457943,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 1.144,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.778451008261011,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 1.1921,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.8680554616182264,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 1.1567,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.7610705771800518,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 1.1568,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.8059528339762222,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 1.2073,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.8428663715678696,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 1.1649,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9188786174992472,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 1.29,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.8756111438473054,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 1.2236,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.8733922333991385,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 1.1981,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.7711392324119829,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 1.1458,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.8062835770218935,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 1.1183,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.8747371063888001,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 1.2223,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.8759256057695535,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 1.141,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.7665422523949811,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 1.0992,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.8298592894443804,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 1.0957,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.8080989355798822,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 1.1065,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.9143197985164898,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 1.2077,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.7882212969542681,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 1.123,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.7852271835858652,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 1.1432,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.8526592754741675,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 1.086,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.8495245556948786,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 1.2061,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.9870834284353387,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 1.1034,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.8040489882141855,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 1.1533,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.7964631759376904,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 1.1348,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 1.0023008023943585,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 1.1666,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.7331303069308008,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 1.0718,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.8663504773008042,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 1.2721,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.7492676947718987,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 1.0432,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.7401120500297591,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 1.1536,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.8640112094653735,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.2418,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.834158086197472,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 1.1254,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8096342566815984,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 1.1253,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.7767589861691669,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 1.2038,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.7587020261672197,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 1.0089,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.8011720535678255,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 1.191,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.6753522844900665,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 1.0858,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.7410365911986037,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 1.1445,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.8060757851701591,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 1.2159,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.8420427461230551,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 1.1538,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.8784266723731003,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 1.2057,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.8013200004781331,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 1.1869,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.8583871642996956,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 1.1606,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.7462787557206001,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 1.0587,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.9324702224054695,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 1.1917,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.825952426854641,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 1.145,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 1.03145563184466,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 1.1587,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.7296815917745646,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 1.1109,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.7828315182565447,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 1.1362,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.8661288824834599,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 1.1877,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.8784570468123847,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 1.1941,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.8287026908615008,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 1.1556,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.8357885647659248,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 1.1265,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.9117249328448946,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 1.1627,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.8301997254234771,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 1.1352,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.8634018249078147,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 1.1479,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.8329077897134307,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 1.018,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.850231493224787,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 1.1742,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.8570417611972676,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 1.1468,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.7855160206399039,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 1.016,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.8057271657502297,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 1.1755,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.8720646033479583,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 1.1208,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.8174618395150179,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 1.2449,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.8984493070072344,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 1.1483,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.8676622249630999,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 1.029,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.9174839212355238,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 1.1039,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.7786142188089832,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 1.0746,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.9909890263435749,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 1.2087,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 1.0823583993413264,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 1.1421,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.7866294988212094,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 1.1617,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.7980845840357095,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 1.1537,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.8382061072113046,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 1.1559,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.7508694928613053,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 1.092,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.8447023248602636,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 1.2368,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.8602381239185524,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 1.1196,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.8853054458029322,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 1.2423,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.8939026391356107,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 1.1277,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.8807142157738884,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 1.2526,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.8324583410129928,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 1.0743,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.7490330913568131,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 1.1246,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.8487037505899271,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 1.2339,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.8188830641183935,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 1.1554,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.8660395885091415,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 1.1092,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.8736838218162293,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 1.1988,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.8310946470978122,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 1.1999,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.8990152018181198,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 1.1053,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.7284122434882887,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 1.0132,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.8273713313369169,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 1.1439,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.798738933773401,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 1.0502,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.9731526718979308,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 1.145,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.7761081087955582,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 1.1722,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.7799187025271342,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 1.1349,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.7739626130943175,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 1.0719,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.809233978771651,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 1.1558,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.8605355086619487,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 1.244,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.78285209419252,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 1.1517,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.9576645934425764,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 1.2088,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.8630992051401077,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 1.0503,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.7960993720786123,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 1.0288,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.8196348688283756,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 1.183,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.7687564956603108,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 1.1636,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.8018095926887172,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 1.0865,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.7964486540536525,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 1.0577,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.9073721975513531,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 1.2657,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.7342122892244989,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 1.0574,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.9811847783919604,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 1.2359,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.7612781244581198,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 1.0913,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8234719284665559,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 1.1103,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.7695635781722373,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 1.089,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.7788976075736265,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 1.1197,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.8206755048873753,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 1.1847,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.7634961509457562,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 1.0367,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.8036025900599119,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 1.1318,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.7062609717479594,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 1.0694,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.7701703193316796,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 1.0875,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.9649938639591058,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 1.208,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.7555581523592024,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 1.1051,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.8368832116248573,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 1.1158,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.7286985238454796,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 1.1732,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.870187735043427,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 1.0843,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.7341923862261764,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 1.1035,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.78122639104198,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 1.0914,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.7325166160566458,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 1.1732,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.7922310272884773,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 1.1122,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.8753271390663528,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 1.1513,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.9131560606826612,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 1.0925,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.7698842542187385,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 1.1716,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.8426807492438998,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 1.1362,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.783156835957247,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 1.069,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.779858408711323,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 1.1112,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.8247142532205438,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 1.0457,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.8560843223223137,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 1.2159,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.7845474164901701,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 1.1098,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.7449190731629902,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 1.0733,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.8630092715184896,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 1.1833,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.7867104021920172,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 1.1309,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.8244636276482338,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 1.234,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8138146321719298,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 1.1515,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.8425524497553843,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 1.1644,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.8547259869847009,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 1.1667,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.7244799287511947,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 1.0842,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.7524698818608209,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 1.1178,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.9881487441349832,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 1.3182,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.7576640311695998,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 1.0744,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.9096052404769226,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 1.1455,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.7379987979671901,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 1.208,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.7905102802551862,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 1.055,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.8126527688723885,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 1.1168,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.8383644272793868,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 1.2018,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.8729393506383202,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 1.0735,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.7951320488942527,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 1.2413,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.8324150961444802,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 1.1574,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.9271836872000464,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 1.1051,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.8522926371827139,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 1.0549,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.7377990293794885,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 1.1044,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.7792385782548877,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 1.0408,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.7603970847804558,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 1.2524,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.8857803738015847,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 1.0188,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.7357070656600264,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 1.0963,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.8473491473253034,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 1.0699,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.8231635478717917,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 1.018,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.8622047371192733,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 1.1781,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.8044949620138342,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 1.0902,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.7789169141323721,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 1.1454,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.8896580003141731,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 1.1064,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.783026117101944,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 1.1428,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.8550674103842599,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 1.2133,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.7838714937143493,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 1.0849,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.8338129750479177,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 1.1078,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.8534721867473872,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 1.0528,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.7947880348906006,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 1.0495,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.8090524858380864,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 1.1755,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.6974919434108282,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 1.0676,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.6628997531757636,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.9407,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.7623591178079464,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 1.043,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.7233254791162921,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 1.1492,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.8506229695632855,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 1.0879,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.9005950117591424,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 1.0699,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.6814512621221664,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 1.0912,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.7992049769516041,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 1.0538,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.8192409966078561,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 1.116,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.8669150974095468,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 1.1496,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8239210079645767,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 1.1311,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.9021066280304881,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 1.2361,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.8013447776589625,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 1.0475,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.7673623371996414,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 1.0197,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.8324137919874324,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 1.088,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.7144867605916154,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 1.0847,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.8302290746064859,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 1.094,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.915754184283194,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 1.1402,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.8174700752216338,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 1.2386,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.7545048285638264,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.9968,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.7523805546706704,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 1.0887,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.8255564351449471,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 1.1428,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.7682235946251612,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 1.1986,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.7508838695782295,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 1.1666,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.7272082061501186,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 1.0427,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.8136674796372391,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 1.1624,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.9471246322793946,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 1.1944,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.830793594639552,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 1.1162,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.7829685338313231,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 1.0509,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.7677342082631756,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 1.1581,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.8580155229574348,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 1.2314,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.8315893544832171,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 1.1727,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.790733072519165,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 1.0775,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.7797569176739247,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 1.0605,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.7558158376183497,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 1.0332,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8161929460899198,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 1.1533,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.8758741339277274,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 1.2084,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.8491665046048456,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 1.1098,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.8289959874690686,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 1.113,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.8859879330617857,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 1.2009,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.7944978619957024,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 1.2376,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.8411628207305331,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 1.0887,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.7866710464305886,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 1.0701,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.7260546876982716,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 1.1078,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.9615683500825383,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 1.0852,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.9611345980076806,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 1.2499,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.8612297012148447,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 1.1793,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.8216178579217599,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 1.1088,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.7562499369168286,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 1.1269,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.777584454108545,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 1.1456,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.82330199497272,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 1.1404,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.8305045181413623,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 1.1219,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.826489086493862,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 1.1179,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.8079475819208309,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 1.0196,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.8318839031216161,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 1.1863,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.7703937170177031,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 1.1027,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.70822487933415,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 1.028,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.8026825675935317,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 1.092,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.7721965398534467,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.9474,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.8220840958840194,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 1.2077,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.7661086225593968,
+      "learning_rate": 0.0,
+      "loss": 1.0666,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 203897771933696.0,
+      "train_loss": 1.2123233310699464,
+      "train_runtime": 6268.0756,
+      "train_samples_per_second": 1.595,
+      "train_steps_per_second": 0.1
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 203897771933696.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cbb7e48d87741e690140e127f0c8e291b49c2200
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..95612d5387273527cd475667c07dd6094e46d4c8
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6b5a0f19c62ddf44c3b3f88a80303cd30828b6a23c325502b62d35ab90c98de
+size 671150064
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8721a47207e4d1a2a764da69a07a0e0167fbdd63
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0572d0914e749199035e01a17b945934ce0ce81d1c0d1e9cc044983fb4e5172
+size 918507402
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d96842d70d28f0be419a5b5dd303b9d9ae9650c1
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 2.167579092516437,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.828,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 2.340629280044389,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.9176,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.811463906297539,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.7397,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.747010319714731,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.586,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 1.520092988296709,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.7585,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.3268982929526139,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.6026,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 1.4764488386549275,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.6617,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.6279768120710967,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.573,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 1.4270807326087176,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.5528,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.2428428678982786,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.3892,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 1.2401580956271006,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.3806,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.2819503399116507,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.396,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 1.1882915733386645,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.4601,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.0903858341710697,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.344,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 1.0596846531636785,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 1.3555,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.0634815912562547,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.4242,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 1.0878252629197922,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 1.3516,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 1.1800010976164466,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.5403,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 1.1640427666915913,
+      "learning_rate": 0.0001,
+      "loss": 1.4014,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.266209495132437,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.5235,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.9978913092746845,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 1.2294,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 1.1342616642751515,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.4345,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 1.215812237535266,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 1.3902,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.0844024644780883,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.2559,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.9876516028052728,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 1.3326,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 1.0035043823088348,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 1.4357,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.9948732657309445,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 1.3949,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.044231166970953,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.3937,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 1.0001653312399295,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 1.3577,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.1236353066110405,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.387,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.9847793391832763,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 1.3356,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.0734795709259823,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 1.345,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 1.0018645283773786,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 1.3844,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.9741280531955702,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.2712,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 1.019580278753891,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 1.3357,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.9806417285455257,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 1.3431,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.9902082070954336,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 1.4262,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.8904579435910817,
+      "learning_rate": 0.0002,
+      "loss": 1.3145,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.9830855617998796,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 1.3633,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.0024726704595865,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 1.3568,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.9739528854859765,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 1.3164,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.8801591109479502,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 1.2211,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 1.0260757591954461,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 1.291,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.1031534942231496,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 1.4759,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 1.0606001512863072,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 1.4111,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.9840640012343169,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.2385,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 1.1052577461032573,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 1.285,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.9118307388774117,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 1.2741,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 1.004695540398904,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 1.3496,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0407289757019198,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 1.3497,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.957036327448944,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 1.2609,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.0044911323790189,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 1.3899,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 1.0623053762303933,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 1.2557,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.9863645586943295,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 1.3906,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 1.049058703594308,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 1.3907,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.0807933613915983,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 1.2593,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.8880685140279246,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 1.298,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 1.0984359847011123,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 1.3714,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 1.0017796656325524,
+      "learning_rate": 0.000199851886084842,
+      "loss": 1.4406,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.9497230746477366,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 1.252,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 1.0207357838139706,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 1.2045,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 1.1052157003992444,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 1.3368,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 1.15820775048525,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 1.3323,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.0393811648711768,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 1.4127,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.9454950054465096,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 1.2724,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.8458612707334442,
+      "learning_rate": 0.00019973673694024,
+      "loss": 1.1787,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 1.175921001498925,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 1.4735,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.9928286314149518,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 1.2568,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.9383971568954167,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 1.3237,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 1.0146098396729828,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 1.3568,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.9449158480327011,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 1.2383,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.9737420940105619,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 1.3425,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 1.0702776392118603,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 1.2921,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.8978325266237281,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 1.3866,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.0761678411573041,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 1.3481,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.053564226804702,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 1.3846,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.9685816836019822,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 1.2801,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 1.2255497280368406,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 1.4378,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 1.0024763065869442,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 1.2852,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.9526992946632433,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 1.2434,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.8944689402544411,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 1.2831,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 1.0642179704409769,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 1.3311,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.957225504372996,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 1.3431,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.0173050328208,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 1.3324,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 1.020810730538368,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 1.3101,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.9494397858523248,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 1.2692,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.8914234539743199,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 1.2945,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.9344342732976146,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 1.3109,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.985271247374639,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 1.4008,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.9247507759436328,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 1.2126,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.9521122192113574,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 1.3568,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.919839449603137,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 1.272,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.9313738933256456,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 1.3046,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.9274040344808508,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 1.3611,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.932158258853334,
+      "learning_rate": 0.000198910508011824,
+      "loss": 1.2141,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 1.2689889165504267,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 1.4466,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.981717390104989,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 1.2913,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 1.042409120435065,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 1.3284,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 1.0110680752501544,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 1.4257,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.8952232892799127,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 1.1767,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 1.06830466342513,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 1.3548,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 1.2282347155104663,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 1.4166,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.8658617487000834,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 1.2353,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.9595117376855367,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 1.2504,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.8922764304904715,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 1.2383,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.9569832187758008,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 1.2801,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.9279072735190277,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 1.3134,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.9343160937396461,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 1.3676,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.9729683033277049,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 1.313,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.0499300246022472,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 1.3321,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.9508681720875867,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 1.2798,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.9677292544725961,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 1.4221,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 1.0112443013225598,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 1.263,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 1.014832326054943,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 1.3264,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 1.013409537107313,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 1.3215,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.0274324617959099,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 1.3629,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 1.0713761880177823,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 1.2939,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 1.068251234875904,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 1.447,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 1.0559978592286707,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 1.3106,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.9247675055087754,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 1.3401,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.9264122038131352,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 1.3146,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.914783829048558,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 1.3073,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 1.0154466441114909,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 1.3817,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.910123856772834,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 1.234,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.1034973206566288,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 1.3984,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 1.0167835565624685,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 1.3065,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.9541771221806903,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 1.311,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 1.072005491293411,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 1.2649,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.9222742105301948,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 1.2939,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.9182690156303601,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 1.2296,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.873328914275884,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 1.14,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.89808663605913,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 1.2603,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.8942697931590334,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 1.283,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.9863497164075619,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 1.3498,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 1.065122487379581,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 1.4105,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.9588226639761218,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.3338,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 1.1423314012184513,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 1.2981,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.9877321537388655,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 1.1558,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.9823929913907383,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 1.2956,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.0129620806874113,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 1.3666,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 1.1160502930942982,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 1.2756,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.9959529425803797,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 1.3355,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.9717112645235756,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 1.2239,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.9263133806847209,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 1.2585,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 1.1246517730989027,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 1.1816,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 1.0210343295751094,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 1.2572,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 1.1435629451680012,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 1.2142,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.8973726237006526,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 1.2267,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.9375307020959045,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 1.1957,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8877654959306899,
+      "learning_rate": 0.000195815455670239,
+      "loss": 1.3156,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.9237560489461866,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 1.219,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.9940201908718649,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 1.3696,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.9504882303629454,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 1.2064,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.9333353719171309,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 1.2968,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.9698434067734869,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 1.34,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.9553092369735782,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 1.2127,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.9009311579341852,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 1.2811,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.9870694380783888,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 1.3883,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.9961378055276119,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 1.4485,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 1.0449183624088734,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 1.3765,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.8615043474531041,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 1.0767,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 1.0528187542828291,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 1.2594,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 1.1340945053685607,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 1.3687,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.9501450942419837,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 1.2395,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 1.045675073511293,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 1.4314,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.8775108529320873,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 1.233,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 1.0583587466802307,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 1.3102,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.9559765272138694,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 1.2742,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.9777314716016529,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 1.3153,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.002553842293866,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 1.2063,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.9095180233066996,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 1.1509,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.9465664389556642,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 1.2949,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.9679809142755188,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 1.2996,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 1.0395885787436914,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 1.4069,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8891529964391287,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 1.2993,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.9225541873816892,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 1.219,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.8655317895338619,
+      "learning_rate": 0.000193579174539646,
+      "loss": 1.2356,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.9155349545526035,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 1.2478,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.9919240587073496,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 1.2888,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.9112547413510824,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 1.2623,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.9308186740534311,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 1.2761,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.9196186440867572,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 1.254,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.9142485565040559,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 1.2618,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.848240080243922,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 1.2474,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 1.010522331993527,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 1.2613,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.9711072419826322,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 1.3189,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.9538559181814769,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 1.3093,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 1.0971831439930473,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 1.4779,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.8851820843384431,
+      "learning_rate": 0.000192437472817166,
+      "loss": 1.1911,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.9551242000889022,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 1.3758,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.9441705161533531,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 1.2271,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.9836464031073223,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 1.2756,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.908853294679305,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 1.1848,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.8494105866506043,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 1.1351,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.9247520768609205,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 1.3106,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.9433884410895655,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 1.3105,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 1.0114886408840793,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 1.4155,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.9676678745331979,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 1.313,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 1.1155683911355467,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 1.2428,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.879055283601944,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 1.1823,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 1.0153817814809485,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 1.2951,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.9830999073990369,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 1.2347,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.9347508555457964,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 1.2941,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.936274005635992,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 1.2885,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 1.104866636661514,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 1.2579,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.9673565382699655,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 1.2739,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 1.0272681517131834,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 1.4015,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.908957962895597,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 1.1708,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.9940800186635145,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 1.2064,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.9269814183548376,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 1.1778,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.9549520056237327,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 1.3004,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.9302851719343455,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 1.2855,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.9543168060402824,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 1.2612,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.9215352284559458,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 1.3013,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.9373405519430387,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 1.3069,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.9109028462151921,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 1.289,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 1.0417375130960227,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 1.3804,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.8992185336285579,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 1.2244,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.9498360222597024,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 1.2513,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.9137276089471331,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 1.2456,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.9343334801114195,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 1.2453,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.902300229292226,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 1.2514,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.9627526933565559,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 1.2534,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.9707956350087426,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 1.1498,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.1040611555514872,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 1.3959,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.957089113833274,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 1.2859,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.8774015715700482,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 1.1675,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.9433319527481238,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 1.143,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 1.0214888795909183,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 1.2955,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.9616667839156496,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 1.2678,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.8891852551275778,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 1.2647,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.8802953650380777,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 1.1692,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.8313635670238767,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 1.2354,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 1.0199744757784006,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 1.1035,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.8611064620021869,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 1.2388,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.8735673609655413,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 1.1727,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.8901145078375076,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 1.2797,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.9652231294142449,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 1.3121,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.9104393378234763,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 1.1945,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.0112227771918163,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.2843,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.8630469893675877,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 1.2034,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.9188347046864217,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 1.1626,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.9278100650647194,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 1.1024,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 1.0464323384986105,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 1.3174,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.9851443925974323,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 1.2532,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 1.0030662498592748,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 1.3867,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.8551821476650675,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 1.1021,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.916813509295243,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 1.1815,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.9293644866629288,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 1.2278,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.9157513564772256,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 1.342,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.882577045734966,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 1.2078,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 1.0411934134353433,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 1.3107,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 1.0487036682648194,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 1.2408,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.996003176964268,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 1.3839,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.9993503556381033,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 1.2378,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.9735271694798517,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 1.2637,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 1.0260426667918034,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 1.295,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 1.8934073314300326,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 1.2806,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 1.1055103869913467,
+      "learning_rate": 0.000184036060115244,
+      "loss": 1.1872,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.9182298549291064,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 1.2763,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.9957870148143394,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 1.2213,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 1.0231771013527717,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 1.3007,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 1.053869176624131,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 1.228,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 1.0541928753871113,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 1.3133,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 1.048461099911668,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 1.2627,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.9680026687318783,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 1.2683,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.9159924136404415,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 1.1734,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.9493743000419643,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 1.3051,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.7984846876543369,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 1.2875,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 1.0588105456539831,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 1.3482,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.8617306722077073,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 1.2347,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.9201580389065354,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 1.3656,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.9632315380099041,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 1.3028,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 1.036405616549462,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 1.2665,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.8722582297782859,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 1.2154,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.9598056669866919,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 1.2157,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.9093669858230348,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 1.217,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.9301820924793702,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 1.099,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.8515405402797095,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 1.1085,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.9886837697985009,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 1.2912,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.9108321566956064,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 1.3845,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 1.020152530259783,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 1.2758,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.9135337960724216,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 1.2935,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 1.003246670931465,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 1.2846,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.9908671594294807,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 1.2798,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 1.0046666460518803,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 1.3153,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 1.0050270897576201,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 1.2318,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.8898448753827383,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 1.2443,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.8456349163049246,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 1.1138,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.9006103289307575,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 1.3603,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 1.1364506597841737,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 1.3077,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.891233671680888,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 1.2842,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.9822838216631534,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 1.1698,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.9004032270433734,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 1.3045,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.8809100040469414,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 1.3591,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 1.0363156271259157,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 1.2574,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.8941089337345295,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 1.1797,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.9854339490702365,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 1.1502,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 1.162057797328878,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 1.4165,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.8667128774735741,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 1.0879,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.8755236610189415,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 1.1483,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 1.0760108468427834,
+      "learning_rate": 0.000177485710710289,
+      "loss": 1.3106,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 1.0103325474819502,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 1.2281,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.8605645733888942,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 1.2049,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.8892076278648906,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 1.3148,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.9676847990497317,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 1.2684,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 1.0896990899045311,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 1.3011,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.948038100131218,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 1.2795,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.9585751723182749,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 1.2842,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.8352758228615104,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 1.1673,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.9235985259875558,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 1.1964,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.9045244602385455,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 1.2193,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.8395526289182972,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 1.1979,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.9196580961351944,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 1.1456,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.8143014058708408,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 1.08,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 1.3447886704942074,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 1.3336,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.9149130861662886,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 1.2199,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.9363131784977247,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 1.3458,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.9245883146194218,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 1.2124,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.8623535081047469,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 1.1537,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.8261285070810973,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 1.2583,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 1.0443091644941611,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 1.3473,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 1.1936136897937293,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 1.3646,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.9618183784456393,
+      "learning_rate": 0.000173756913120621,
+      "loss": 1.195,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.8834444187506799,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 1.1462,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 1.0644742450260856,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 1.2692,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.9763081988106593,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 1.3289,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.9067380840876519,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 1.2101,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.9877674117491254,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 1.2473,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 1.0166560289485393,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 1.2879,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.8867253833662876,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 1.204,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 1.0222895737290478,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 1.3303,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.9297052904091985,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 1.2158,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 1.113950668125169,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 1.4063,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.8315094003529999,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 1.2375,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 1.0132794538892895,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 1.3325,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.9301648106998512,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 1.314,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.8387166059271942,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 1.2933,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.9491040601324949,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 1.3023,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.9589786429040578,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 1.1919,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.8211356124008726,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 1.1531,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.8651844248084547,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 1.1853,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.9587234574931144,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 1.2009,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.9638887805121406,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 1.3002,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.9884632423837139,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 1.2294,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.8617371245052429,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 1.1756,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.8570029343254111,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 1.1458,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 1.0403499538130299,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 1.2649,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 1.0701402360745733,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 1.3516,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8046004799379116,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 1.1243,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 1.0896556455414892,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 1.3964,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.8051493078826016,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 1.1317,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.8861789311545909,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 1.2176,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.9581457318097438,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 1.2752,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.9500815430533931,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 1.229,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.8983829589570962,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 1.1718,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.9563371854485535,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 1.2362,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.8612559432875038,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 1.2893,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.9028864105321798,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 1.1828,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.8451192551757782,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 1.1265,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.9533224377402716,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 1.209,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 1.0147529765631986,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 1.2257,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 1.031808782707111,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 1.3177,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.911147895751784,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 1.2467,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.7817909097309222,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 1.1432,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.7444889968847276,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 1.0799,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.8492631593144773,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 1.1427,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.9450214645654191,
+      "learning_rate": 0.000165592860169994,
+      "loss": 1.2868,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.8397588768973042,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 1.2199,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.8322979893800689,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 1.2057,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.8670377342574328,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 1.1538,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.8914144967857235,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 1.1952,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.9502443612778761,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 1.2334,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.98482104661955,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 1.249,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.8645467629715614,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 1.174,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.8561391927441676,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 1.1494,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.9171929313181237,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 1.2708,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 1.0114481827984823,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 1.262,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.7901886201121449,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.9509,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.9286085733043485,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 1.2093,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.8525243507575246,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 1.2247,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.8049172689583466,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 1.1541,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.898689792997307,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 1.1567,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.944213975316119,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 1.1869,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.9682693375087824,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 1.2967,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.9018898556139062,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 1.1955,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.8668345825876101,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 1.3118,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.9774777295478445,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 1.219,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.8041179042413573,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 1.1725,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.9130745985984293,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 1.1321,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.8711746954473047,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 1.1676,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.9128330506885919,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 1.2505,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.845758888302076,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 1.2259,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.9059778839638238,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 1.2066,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.8653822503261192,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 1.071,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.8687364719367553,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 1.2495,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.8680808216538957,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 1.2001,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.9619686817861596,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 1.2892,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.9611654419173977,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 1.2643,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.7915960551513423,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 1.0126,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.8065933118797741,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 1.1837,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.7630648072457108,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.9984,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.8008835614895283,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 1.1827,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.8181476801131583,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 1.1794,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.9518258795235218,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 1.3664,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.9740622816903404,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 1.2787,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.8894350998025817,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 1.2131,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.8033594520828705,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 1.1828,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.9186622687647027,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 1.2594,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.9134888956592484,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 1.2057,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.9500637042175568,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 1.207,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.849598717746634,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 1.0906,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 1.0259657028423845,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 1.3111,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 1.0309066909033666,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 1.2006,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.9460127280243473,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 1.2872,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.9494818972639643,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 1.0666,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.9948125878007591,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 1.1886,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 1.0347143496320934,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 1.1577,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.8971181852404114,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 1.1707,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.8744963963866264,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 1.1817,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.873626962980643,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 1.173,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.8742899718656414,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 1.1749,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 1.030382215523112,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 1.4224,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.9538569427641035,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 1.236,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.925636229189722,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 1.2947,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.9716062996409484,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 1.2085,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.817533441455966,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 1.1791,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.7932440760120276,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 1.1963,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.943311651505502,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 1.3386,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.977028135598457,
+      "learning_rate": 0.000152669141192587,
+      "loss": 1.3633,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 1.0906113450970434,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 1.2432,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.8424851045981453,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 1.168,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.8615709350842159,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 1.1567,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 1.0404299995470772,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 1.3156,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.8639625636960955,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 1.2002,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 1.0379110675975693,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 1.2018,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.9839761304214002,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 1.0979,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.9624304807291328,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 1.3084,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.900987750667034,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 1.2027,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.8888875144514918,
+      "learning_rate": 0.000150448286344864,
+      "loss": 1.3207,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.8850569429393079,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 1.2416,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.9003696024963973,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1479,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 1.0064359203785187,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 1.1769,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.9014492959862608,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 1.2437,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.8582665088283009,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 1.2826,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.9378752094731411,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 1.276,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.9320151471539153,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 1.1669,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.8645625988589353,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 1.1414,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.9461016811211961,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 1.1583,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.170475064084964,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 1.2634,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.8528015522455052,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 1.1828,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.8948649989942181,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 1.1555,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.956391413560586,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 1.0984,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.8815982203572552,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 1.1565,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.9366137358254796,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 1.1641,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.9463767041942464,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 1.2412,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.8635682429287176,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 1.1995,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.9412634696119908,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 1.256,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.894710543323146,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 1.1971,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.9849238469492475,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 1.0918,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.9372613714430934,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 1.304,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.8366575987171679,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 1.1435,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 1.0183573724764445,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 1.372,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.921436388687488,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 1.2401,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 1.0126133537347326,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 1.2933,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.9366176126930088,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 1.2144,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 1.039845091753781,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 1.2504,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.831029380673922,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 1.1508,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.8257437837880253,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 1.1706,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.8023515201237144,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 1.1508,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.9545116084936521,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 1.3051,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.9717442585412174,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 1.3213,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.9015444991867029,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 1.1661,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.8453162594038274,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 1.2304,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.8760218476506566,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 1.2186,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.7876474094502752,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 1.075,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.7993246547885368,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 1.231,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.7907090498056094,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 1.1443,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.7511377628701228,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 1.1022,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 1.0602160883969263,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 1.1419,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.8499695191433811,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 1.1788,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.8540057141934249,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 1.105,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.8102920946458256,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 1.1683,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.9072627599036039,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 1.088,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.9586398821405642,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 1.2484,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.9290000247371769,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 1.1843,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.8375789473026685,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 1.0358,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.7635571324142661,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 1.1445,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 1.045797503899567,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 1.2421,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 1.0109307529063931,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 1.2873,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.797021878143829,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 1.2203,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.9700917177296354,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 1.1761,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.890026042051327,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 1.1883,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.765185289338175,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 1.1012,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.8140561721789563,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 1.1017,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.9666521023648389,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 1.1211,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.9393299199746825,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 1.2397,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.894655604863808,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 1.1175,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.8959521328951345,
+      "learning_rate": 0.000136706389208128,
+      "loss": 1.138,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.9818068524694171,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 1.137,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 1.027296109586849,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 1.1507,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.8678449288401437,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 1.1469,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 1.0031997037428155,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 1.2433,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 1.0515492628817857,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 1.2729,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.8885796845124188,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 1.1275,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.8641143500851984,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 1.2576,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.8154811162618586,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 1.0765,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.9568018074614199,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 1.1195,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.8108302299524088,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 1.053,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.9409675732775402,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 1.127,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.7781864660886626,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 1.1329,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 1.0199127709160234,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 1.2385,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.8221811253224244,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 1.2236,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.9463573458104121,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 1.2651,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.8597680591743613,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 1.0963,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.9237614005652235,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 1.1679,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.9161415729139808,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 1.1218,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.7773862816198334,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 1.2071,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.8763697877745098,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 1.1922,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.8436430947992818,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 1.1839,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.8542536722190046,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 1.2099,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.8718154209836556,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 1.1767,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.814140019880975,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 1.1537,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.8889221338176805,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 1.2299,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.7808170171112502,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 1.1895,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.8232553022619731,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 1.2091,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.8391444436897774,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 1.1803,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.8753444917553742,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 1.1082,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.7924003048200808,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 1.1342,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.8983735111525624,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 1.2484,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.895102530864573,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 1.1044,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.9445868985759234,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 1.121,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.9149240583640527,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 1.1474,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.8571405322515444,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 1.1563,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.8481113794861626,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 1.1171,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.8603903731508595,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 1.1545,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.7662421829165785,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 1.048,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.8706681219444926,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 1.1907,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.875195678660521,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 1.1865,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.797748206572489,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 1.2685,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.8273035537862294,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 1.2624,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.8766792540528516,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 1.0499,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.9423730978153396,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 1.174,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.8276737854919352,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 1.1171,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.8108921482599494,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 1.0962,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.8387867133920371,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 1.1382,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.9413873346510828,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 1.2573,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.883006129224719,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 1.1496,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.8199461868571233,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 1.143,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8553685705620094,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 1.2265,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.8820477217672932,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 1.2634,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.8601839000066415,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 1.247,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.8527697759820766,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 1.1594,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.8123714400910506,
+      "learning_rate": 0.000123117632211497,
+      "loss": 1.1541,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.8388929535371012,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 1.2318,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.9072499746597118,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 1.2327,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.745225852061896,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 1.1105,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.8784362755933546,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 1.2133,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.8153133519550086,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 1.1971,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.7811186057897588,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 1.0931,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.9326536326980271,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 1.1517,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.8216914978062639,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 1.0912,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.8476962507001223,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 1.1957,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.8596187835809099,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 1.1555,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.8298184772484226,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 1.1074,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.9609701936506492,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 1.2726,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.8680508025657486,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 1.1339,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.9925839781085051,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 1.2028,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.8125320750678805,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 1.0625,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.8768511805834935,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 1.2186,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.8345742788681829,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 1.1813,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.8959461578622999,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 1.1638,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.9924411869619153,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 1.1712,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.8943053158039393,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 1.1679,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.805038370805271,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 1.2985,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.8873055710491184,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 1.197,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.7816032860712533,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 1.0355,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.906300282180546,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 1.1816,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.7925010420648496,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 1.1191,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.8376692937905297,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 1.1698,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.8948600707756753,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 1.2028,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.8952116689846026,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 1.1844,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.9094589278333447,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 1.1354,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.8536043917642891,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 1.1392,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.9733923853117847,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 1.2189,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.8964503801179634,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 1.1615,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.8877921864629532,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 1.1435,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.8372328560031221,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 1.1934,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.9079557906978756,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 1.1476,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.8510474966571356,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 1.2275,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.8408931790931111,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 1.1917,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.8343733813573859,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 1.1732,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.8289961257458175,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 1.1986,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.8555001028611945,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 1.1986,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.8264825870352727,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 1.1929,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.8553948588686492,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 1.1033,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.8380899044644874,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 1.1686,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.8727679175975,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 1.041,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.8180328740580982,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 1.1672,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8400486300979757,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 1.2248,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.8668030633717136,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 1.1847,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.9183371992913884,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 1.1679,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.759898728911129,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 1.0188,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.8840110162639573,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 1.0226,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.7914148929556654,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 1.0652,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.7403012262197173,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 1.115,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.8253809044516809,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 1.1122,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.8853790879504476,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 1.1727,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.909555754043428,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 1.0891,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.8425311868030075,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 1.1894,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.812821876426759,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 1.1576,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.9936089741508148,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 1.2062,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.8209545933895037,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 1.2334,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.9796965099618715,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 1.2203,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.8950705321197276,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 1.1832,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.8249629348907459,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 1.1365,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.8558753794538538,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 1.2335,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.8764834607191381,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 1.0876,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.8968033214264838,
+      "learning_rate": 0.000106475648471337,
+      "loss": 1.2158,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.8854900296778185,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 1.1501,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.9247912025755864,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 1.2254,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.8296803216440574,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 1.1098,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.8445302120263739,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 1.175,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.8526662821322857,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 1.2137,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.8573450684787427,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 1.1198,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.9704132936461313,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 1.2565,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.9094218556866875,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 1.2282,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.7589102419386903,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 1.0726,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.8568552824329363,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 1.2225,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.9099573473568132,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 1.2192,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.8663868216092737,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 1.1698,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.9012087460682382,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 1.1963,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.8542139256758909,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 1.157,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.7399476383108098,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 1.0874,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.890064848208203,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 1.2032,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.8900187384119745,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 1.1205,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.8774863747578419,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 1.1616,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.8024417773551542,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 1.0815,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.8365325260082347,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 1.1137,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.7979893484558706,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 1.092,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.8700261231040217,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 1.2301,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.8760299325080985,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 1.0199,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.8407610443896854,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 1.1225,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.9984053835752241,
+      "learning_rate": 0.0001,
+      "loss": 1.1345,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.8939606929722101,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 1.125,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.9033370249042589,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 1.201,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.787837536241076,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 1.1558,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.8355913231040456,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 1.1463,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.8169983302610252,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 1.1265,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7757282371794076,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 1.1897,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.7989135286082059,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 1.1708,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.9121822189834736,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 1.1805,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.82789990823343,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 1.1776,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.930793369669878,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 1.3292,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.8918823860727169,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 1.1502,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.8902864870451087,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 1.2186,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.8781025509894115,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 1.2224,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.8121841814461802,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 1.2021,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.7886764492758879,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 1.0356,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.7686029066214254,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 1.0723,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.8703794048947431,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 1.075,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.8989762785964018,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 1.1801,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.8864219915448953,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 1.1061,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.8086124406622477,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 1.1339,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.8818612647451756,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 1.2096,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.9279341204076482,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 1.0718,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.8212583197585621,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 1.1684,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.9224561529108475,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 1.0358,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 1.0488626132449583,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 1.1282,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.789184234952121,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 1.1243,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 1.1532029459770452,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 1.1105,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.8583297822926123,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 1.1302,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.8746865884006344,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 1.1228,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.8308540377755012,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 1.0653,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.9080365422518107,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 1.136,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.8244965498672249,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 1.1526,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.8778465204353496,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 1.191,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.899596371232968,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 1.2633,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.846027449549948,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 1.1045,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.875587666265761,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 1.1452,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.8182419596546981,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 1.0371,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.883399913373032,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 1.2016,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.8182133900424698,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 1.134,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.7773559052370546,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 1.1416,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.8151636516880102,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 1.0544,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.8119234078524113,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 1.1735,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.7939689877289976,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 1.1403,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.8701602064505436,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 1.2373,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.8750979269151778,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 1.1367,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.8368484768425418,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 1.0717,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.7722128748638069,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 1.0568,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.8650625917734309,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 1.0874,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.868758473046753,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 1.1982,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.7333969388904342,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 1.0735,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.9342906386330464,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 1.1111,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.9329161005373878,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 1.1205,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.9165208035013336,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 1.1113,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.8801790291991601,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 1.0137,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.9601549553046083,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 1.1841,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8379558504679411,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 1.1055,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.9093649112724251,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 1.2555,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.7687625068151095,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.9919,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.7714325328233042,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 1.1593,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.8718743778557958,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 1.2377,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.861892519002274,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 1.1547,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.8471051334080404,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 1.2204,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.7800037231529334,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 1.1018,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.8564345124089149,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 1.1518,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.9264361623337496,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 1.0757,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.8712899167429417,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 1.1096,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.7795794260093907,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 1.0942,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.8750109638854078,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 1.0867,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.7642708387310234,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 1.1202,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.8697644696800666,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 1.1798,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.7282315195593572,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.9475,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.8771028965759182,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 1.0596,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.8276268545588975,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 1.0337,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.9149360167307691,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 1.2154,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.8458772152757161,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 1.1974,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.9427927985836158,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 1.2551,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.8554309991838364,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 1.0821,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.810579604922163,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 1.0999,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.8458766496688266,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 1.194,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 1.041488539269024,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 1.0821,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.0716270770436451,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 1.0159,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.7889165860745851,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 1.0456,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.8041429168861712,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 1.1367,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.8154198808415671,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 1.1029,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.8118014580044097,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 1.0941,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.792146530511344,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 1.12,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.8868178855674158,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 1.1412,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.8017281660103419,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 1.0368,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.8000691024627,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 1.0888,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.7451097291353643,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 1.0572,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.8729647871873557,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 1.1053,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.8788176333054569,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 1.0999,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.7532952898180028,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 1.0621,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.8344408583571611,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 1.1367,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.7920384181717828,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 1.0681,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.8010517078779921,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 1.1865,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.8010838919456496,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 1.1533,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.933129482023698,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 1.1516,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.8468459182650288,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 1.1326,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.9162291392781863,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 1.1588,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.7736452217785802,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.9918,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.8909484010627318,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 1.0871,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.830245407983687,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 1.1392,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.7938151018111826,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 1.0767,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.8050749300928102,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 1.0704,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.8422094038655485,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 1.0566,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.8125604137667185,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 1.077,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.8215269206004515,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 1.104,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 1.080505997510848,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 1.2943,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.8299123958305165,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 1.035,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.8292512699924797,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 1.0974,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.9041423336155675,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 1.2077,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.8351246439627844,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 1.2475,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.7916680319988713,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 1.1293,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.8148732530602424,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 1.0803,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8352478784169337,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 1.1248,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.848558267850434,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 1.1294,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.886575182677277,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 1.196,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.8400239260638604,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 1.2257,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.716802554303067,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 1.083,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.9234326795779665,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 1.0735,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.8945019976496457,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 1.1687,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.8081601399019864,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 1.1061,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.768131579857086,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 1.0402,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.9024593058339698,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 1.1213,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.7386998610259434,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 1.0713,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.7213578892564265,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 1.0458,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.8547907731081595,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 1.0748,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.8885372244769414,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 1.1719,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.759216817677016,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 1.2117,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.8640646688740952,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 1.015,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.9505908649729419,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 1.194,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.8414799777767897,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 1.1219,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.7801317291815592,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 1.0187,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.8597928560121036,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 1.1172,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.7722221121410255,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.9957,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.9123633740913779,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 1.2027,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.8121400103829539,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 1.0574,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.9259710217409359,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 1.1166,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.8962231282015487,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 1.2077,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.7736903065729944,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 1.1006,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.905468806571558,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 1.1467,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.8409825776664667,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 1.08,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.8617982497771295,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 1.1161,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.9258322580589109,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 1.1235,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.9526781011791231,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 1.1493,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.7781631789182217,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 1.0735,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.7092921491242238,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 1.0933,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.7889253303946118,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 1.0977,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.9442231893690635,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 1.095,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.9488461614190968,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 1.2477,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.7714464171885789,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 1.145,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.8367332889082589,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 1.0824,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.755752037576069,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 1.0332,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.8250639067334734,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 1.0402,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.7435904683490475,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 1.0871,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.869656041067551,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 1.1569,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.8342729387941089,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 1.1291,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.7476414619232458,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.9765,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.7805454973026643,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 1.1149,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.7940158861016194,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 1.1148,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.816238837296542,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 1.1721,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.8612039632403123,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 1.1537,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.8704542256188401,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 1.1211,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.8410500526207699,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 1.0405,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.8437521301512859,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 1.0505,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.7970168742278224,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 1.1339,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.8056564142638072,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.9961,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.8225667991518454,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 1.0822,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.8054113821507712,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 1.0873,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.7807098345586613,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.9968,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.9033512475643545,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 1.1096,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.8316099430383033,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 1.05,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.8364325293651905,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 1.1192,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.8387897132078757,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 1.126,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.7943181258355784,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 1.0905,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.9194687690867345,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 1.1612,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.7659556619523961,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.9791,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.8211925068553112,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 1.0956,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.7689878142328473,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.9637,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.9475286657707129,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 1.1987,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.8798707840647109,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 1.1092,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.9678328660495827,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 1.2474,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.9030882302098774,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.9859,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.8324833034351167,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 1.0637,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.8856226930234121,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 1.0574,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.8589605102834983,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 1.1582,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.8788357210288834,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 1.03,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.930444408144179,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 1.083,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.8435587500060738,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 1.0741,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.787078737860185,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 1.1415,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.9096181402514936,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 1.0613,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.7180597736990848,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 1.0081,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.8183737596107125,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 1.0737,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.7523623613392743,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 1.0959,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.8285998941738993,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 1.0251,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.8296654676630226,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 1.1092,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.7946138376139481,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 1.0882,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.862786168494372,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.978,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.7851555341840509,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 1.0435,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.843942554035457,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 1.0004,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.9082315582270839,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.1232,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.8716824703464001,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 1.0487,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.8594414287342595,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 1.1496,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.9502004091668262,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 1.1618,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8516762802902843,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 1.0397,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.8166207305907555,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 1.1019,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.9482880117564721,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 1.0356,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.7258036421019421,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 1.0178,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.872814346090495,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 1.1421,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.8458393993914094,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 1.0526,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.910122283278521,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 1.1135,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.7783447548656339,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 1.0034,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.7694625021252215,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 1.1224,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.7789699524240866,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 1.0577,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.7966320829391377,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 1.0705,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.7955706972132834,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 1.0405,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 1.0213874747735756,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 1.0886,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.8363724959442057,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.9931,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.7917579494315584,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 1.127,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.832130475395553,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 1.1778,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.8846275284126474,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 1.1274,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.82177266481349,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 1.1013,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.9400441406424015,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 1.1665,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.8310350112225171,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 1.243,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.952079058372432,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 1.0113,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.8088607074314236,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 1.0944,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.828801567930494,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 1.1089,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.9020366227162672,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 1.1116,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.9277789986108305,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 1.1,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8626346678476595,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 1.0016,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.7966414003601904,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.9873,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.8036041116068522,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 1.0645,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.8720820634045482,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 1.09,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.875647722790824,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 1.1393,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.7897158665335176,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 1.1671,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.8279757558420157,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 1.0473,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.7553070530855709,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 1.0388,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.7409780701505099,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 1.0441,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.8813018133474086,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 1.1323,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.8916739966164384,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 1.1638,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.725246831952099,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 1.0564,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.8551037087395035,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 1.0942,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.7667994554139557,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 1.0806,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.7787508174187104,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 1.097,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.7269911908663984,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 1.0385,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.8178400476588422,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 1.0532,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.7338076719326018,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 1.1199,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.9600653046668537,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 1.0884,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.9693259464203958,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 1.121,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.8971313534056323,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 1.1076,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.8505547848799991,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 1.1581,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.901973898717887,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 1.114,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.7732708517581511,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.9983,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.8376010427650987,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 1.1088,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8869780495184847,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 1.1958,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.7383696992151155,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.94,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 1.008957136472109,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 1.1233,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.7561674578801395,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 1.0239,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.7993430654708011,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 1.0965,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.9005324807571452,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 1.1309,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.7688379670981546,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 1.0644,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.755342086444707,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 1.0308,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.9116587787303737,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 1.1085,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.8840048457007753,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 1.0365,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.88369305991926,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 1.113,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.8914593348589701,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 1.0857,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.7202666269984003,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.9599,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.8609160051639316,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 1.0807,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.875315044369369,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 1.0459,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.8046586030392038,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.9892,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.8845577469864819,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 1.0744,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.81407299976784,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 1.1137,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.8719550109634547,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 1.0937,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.8078469055176236,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 1.0418,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.8635161003549958,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 1.116,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.8635724625565849,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 1.0166,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.7958853087898546,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 1.0631,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.8541261044016341,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 1.1747,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.7611045541610414,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.9826,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8166688214584858,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 1.155,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.8729492023257425,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.9985,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.7338889411154411,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.9764,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.7841742476721919,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 1.0407,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.8190648808105729,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 1.0059,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 1.189971966755002,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 1.2784,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.7747620673874669,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 1.0338,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.8459819458124576,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 1.0603,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.7299775773284684,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.9589,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.873153295737133,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 1.0276,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.7399229179029378,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.9942,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.8081689802013524,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.9782,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.8205043844461317,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 1.1347,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.9300242973554271,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 1.1063,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.7955739825725661,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.9989,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.8369237179848538,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 1.0653,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.8877315005007081,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 1.139,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.9063518978090884,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 1.1666,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.8402301995330217,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 1.0871,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.7941155124027731,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 1.0616,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.8086180722645121,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 1.0831,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.7301695836136674,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 1.0564,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.8824825301242152,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 1.0591,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.7608663915405037,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 1.1637,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.9562058298891669,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 1.0107,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.8353655172198402,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 1.0556,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.9446711176411489,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 1.0598,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.8373112011041507,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 1.0998,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.8393140098410127,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 1.0959,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 1.0007850712037603,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 1.0852,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.9547238570159858,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 1.1677,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.7670799358518546,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 1.0491,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.8069720746952094,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 1.0403,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.8323483451268169,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 1.1721,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.8665610531801702,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 1.0433,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.820120496167094,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 1.0389,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.8668907325122193,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 1.0486,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.9215499864182681,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 1.2304,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.8575033778345631,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 1.1016,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.7885383565624948,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 1.1444,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.9616420533689475,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 1.0858,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.8000352388587801,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 1.0613,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.9432878918791088,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 1.083,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.8581930522375921,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 1.0763,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.8477439535159955,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 1.0084,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.8954126142945501,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 1.1148,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.8451424029281424,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 1.0159,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.7687739108451231,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 1.0414,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.8232387388107232,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 1.0491,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.8670984144963605,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 1.045,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.9207845432159518,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.9877,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 1.0445500951448323,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 1.1575,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.8525396222453334,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 1.0176,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.847576403477811,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 1.1012,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.8689298591511133,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 1.138,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.8140926501993877,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 1.0843,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.7594070690792206,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.9893,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.7554074010238546,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 1.0272,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.8250669999383188,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 1.0714,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.8301229569066313,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 1.0756,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.8601687215822483,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 1.0887,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.8506451142903321,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 1.1066,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.8198327681216107,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.9676,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.9012566805483494,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 1.0499,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.7889708330455324,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 1.1374,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.7479780383891586,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 1.1091,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.8085409919807247,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 1.0892,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.8669689768248773,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 1.0941,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.7680590513678617,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 1.1078,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.8266768713093456,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 1.021,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.8507316837188268,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 1.0557,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.7259233668247581,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.9342,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.9510904005306071,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 1.0499,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.8679214790847098,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 1.0864,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.8388281221981571,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 1.1251,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7967302882328616,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 1.0881,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.8788941746699341,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 1.0843,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.8310993678093996,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 1.1267,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.7422918123056207,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.9686,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.8580935822649455,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 1.1098,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.9062482236530666,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 1.1375,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.8078159675834186,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 1.0216,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.8974067691172537,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 1.1616,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.7601753406737961,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 1.0938,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.833755893544062,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 1.0737,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.8417856272896627,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 1.0216,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.7192037601325971,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.9657,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.8325532445569533,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 1.1143,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.8143603763208344,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 1.0981,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.8315570305000287,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 1.0619,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.9431012095409678,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 1.0707,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 1.0858550918279,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 1.1165,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.8236739587451712,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 1.0156,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.797923169332508,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 1.0358,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.9566963752166892,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 1.1489,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.7864660429156809,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 1.0898,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.7712612996367878,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.9535,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.7646883255608178,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 1.1222,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.8401577710670294,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 1.2397,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.8497089754801997,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.9819,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.7901100406395174,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.987,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.7882408403718572,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 1.0182,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.7727435639026202,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.9139,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.9867700550888403,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 1.1207,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.8901041929563709,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 1.0642,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.8830215405712524,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 1.0436,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.7917962858116705,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.8976,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.8433163117805252,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 1.0382,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.7716275597028938,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.9649,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.8069655979220444,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 1.0419,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.8930329120684376,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 1.1139,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.7984848868984005,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.9043,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.8793418989596367,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.9909,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.843670675814957,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 1.0708,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.8266625193427682,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.9907,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.8455368846406135,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 1.0169,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.8443310120368719,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.9915,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.8460089788670055,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 1.135,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.9201811307817429,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 1.1082,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.8339010425588553,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 1.0462,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.7982008593372306,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 1.0188,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.8085660300902544,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.9638,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.8227448081758223,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 1.1124,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.7519255247523404,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.9756,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.7723108828050917,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 1.0099,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.848142675290011,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 1.061,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.8548972885408882,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 1.0671,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.8339445160342264,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.9893,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.8766689327185713,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 1.1564,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.7853075453523916,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 1.0244,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.8570147208703542,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 1.0484,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.8136814107398292,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 1.0179,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.769578688484213,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 1.0155,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.7643325804101265,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 1.0514,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.8210295279927137,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 1.0158,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8542721125276475,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 1.0017,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.7785321602297618,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 1.077,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.7314186123390642,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 1.0268,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.8435403090839856,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 1.096,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.8724304024662537,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 1.0366,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.7369835686536529,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 1.0144,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.8181065216676432,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 1.0245,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.8815239530451991,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 1.1473,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.8032536779947032,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 1.0423,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.7131358007552208,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 1.0076,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.8561235966926187,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.9958,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.8337193626782963,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 1.0355,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 1.0554434463308862,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 1.0646,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.8834137892193425,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 1.1266,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.8081061581358774,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.9963,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.7790424195012522,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.9875,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.9447272471765829,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 1.18,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.7520148284219924,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.9588,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.8934743042970116,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 1.086,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.8616689862663275,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 1.0719,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.7922685409060988,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.9262,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.7694082262129455,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.9479,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.8757202664560848,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 1.0836,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.8801430802878017,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 1.208,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.8873892163530365,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 1.005,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.7935533814333376,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 1.0279,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.8438568284727476,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 1.0458,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.9029979799434774,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 1.1174,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.854865680023673,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 1.0298,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.9227965088124007,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 1.1905,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.7845733244188352,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 1.0777,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.9076594402743925,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 1.1409,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.8343244746160255,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 1.0167,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.7413090833309477,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 1.0595,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.7810378713617924,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 1.0228,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.8818267886496703,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 1.0787,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.9149056583819745,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 1.1618,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.7690539234652768,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.9936,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.8681153381027179,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.9168,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.7567542671180796,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 1.0454,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7510789582140142,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.9934,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.7077461773353635,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.9656,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.8122835526864932,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 1.0685,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.8701275544447438,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 1.1241,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.8299484468653472,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 1.0042,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.7762025442493229,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.9914,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.8600018089658686,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 1.1116,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.7916195175227821,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 1.0024,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.7975427193955309,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 1.0999,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.7203807251928327,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.9853,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.7442501433407781,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.957,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.9649373646508272,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 1.1912,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.8221262344919317,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 1.0573,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.8817730038905364,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 1.0231,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.7588643695285561,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.9955,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.7927470871468772,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.9565,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.8365527062220788,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.9795,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.882373043265085,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 1.0722,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.8038726568288451,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.9725,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.6968209369455347,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.9518,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.8053464432992384,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 1.1971,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.9487223791910389,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 1.143,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.8269540682182868,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 1.0476,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.8959769043876853,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 1.0322,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.8482885755213688,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 1.0814,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.8430964156606374,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 1.1773,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 1.0766306814995699,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 1.1936,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.8555670658867724,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 1.0445,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.7803847535248349,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 1.1091,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.8083442794644436,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 1.1041,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.7675666250884017,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.9808,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.8455159865291129,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 1.0783,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.8961068833586694,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 1.0994,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 1.4506444620216155,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 1.1413,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.770061682050551,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 1.0944,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.7891790037353675,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 1.0574,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.9119231007900593,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 1.0913,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.7950096502373967,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 1.103,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.8791042132117247,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 1.1572,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.7628036129054035,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 1.0829,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.8791188158176501,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 1.0875,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.7523148247789707,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 1.044,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.8113682212901249,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.9732,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.8147464631743097,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 1.1032,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.886241187198881,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 1.1017,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.8386506464729159,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 1.0944,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.7268472637888334,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.9468,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.9525044757276668,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 1.0831,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.8053724140197336,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 1.0737,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.8516042807221436,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 1.1573,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.852134568484373,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 1.0343,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.6990479370212819,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.8985,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.7983754867840562,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 1.0917,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.9274473548553837,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 1.0352,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.8188215523931848,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 1.0683,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.8538581769741597,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 1.0381,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 1.0069134002810485,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 1.079,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.8075925733220816,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 1.0509,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.921754289470793,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 1.1385,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.9552746253102438,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 1.11,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.8481665514412091,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 1.0993,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.8850663754196179,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 1.0511,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.8423309250860148,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 1.0887,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.9086622994893373,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 1.0991,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.7743885704537332,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 1.0021,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.826099928526726,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 1.117,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.8561322325727061,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 1.0156,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.8185299016872479,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 1.0752,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.7788460498099669,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 1.0119,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.8686029111410445,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 1.0305,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.8330074771412815,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.9842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.7930170644993012,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 1.0737,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.8665021364309282,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 1.1391,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 1.0265081407582635,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 1.1065,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.8324763969279934,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 1.1389,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.7998666682996705,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 1.0472,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.8810888210880106,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 1.0541,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.9403217037232425,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 1.0467,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.8300390467407558,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 1.0533,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.9254647235536025,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 1.0816,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.7778474839752096,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 1.0773,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.8003783399982549,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 1.007,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.8812466491710779,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 1.16,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.8326312999003161,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 1.0965,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.7563118996771208,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 1.0676,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 1.036415448691471,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 1.0978,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.8486359609408237,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 1.0701,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.7640372589074247,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 1.0699,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.8004358391415526,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.9778,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.8674774175052101,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 1.0817,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.7729028526574891,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.9437,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.9783238007096406,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 1.1378,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.8522896936237008,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 1.058,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.9505815733076248,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 1.093,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.8219084334556093,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 1.1247,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.7582367787081001,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 1.0931,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.8275786084042629,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 1.0781,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.7825227563832609,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 1.1064,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.9396717094080378,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 1.1058,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.8190377523928112,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 1.1167,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.874112847760409,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 1.0229,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.8033519658511188,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 1.0412,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.8697321168989555,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 1.0862,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.8482848418241187,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 1.0522,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.8067862128846277,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 1.0957,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.7349051364302464,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 1.008,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.8062767237675197,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 1.0228,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.8043382593470939,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 1.0865,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.777121153566047,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 1.0193,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.7720730274857394,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.9782,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.8932139873483963,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 1.1671,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.8094519972185115,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.9897,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.8207848408700373,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 1.1209,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.9064750065862063,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 1.0058,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.8136539983287625,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 1.0715,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.8037367517607537,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 1.066,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.8251132367630437,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.9979,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.8110527503825463,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 1.1374,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.8824191340178144,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 1.084,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.9784790378200482,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 1.112,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.8108840654344787,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 1.0889,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.8600919844190589,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 1.2295,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.9574092537518714,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 1.1282,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.8059799363006932,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 1.088,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.8699350493645416,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 1.0753,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.8706707491571822,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 1.1125,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.9403287413231343,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 1.0149,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.7965770509441772,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 1.0018,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.7904600168542316,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 1.0288,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.9023844501222255,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 1.1748,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.7628378514295081,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 1.0356,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.8420363563080878,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 1.082,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.7698524661911217,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.9641,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.7667599742364795,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 1.0008,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.8424245996095795,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 1.0517,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.8510353343099226,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 1.0016,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.7616975654904604,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.9691,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.8107876525232283,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 1.026,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.7369625978002111,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.9834,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.8910867360427009,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 1.0972,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.748674014255129,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 1.0393,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.8615963370022487,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 1.094,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.875903473799594,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.985,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.8894533001501406,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 1.056,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.8871935662849862,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 1.0659,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.8146409426801687,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 1.1013,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 1.0705442277942085,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 1.0065,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.8415335059819268,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 1.0892,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.8828933279031737,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 1.0625,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.8413043992024973,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 1.1092,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.8051131433252608,
+      "learning_rate": 0.0,
+      "loss": 1.133,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 409299338477568.0,
+      "train_loss": 1.1655625699996948,
+      "train_runtime": 12525.0909,
+      "train_samples_per_second": 1.597,
+      "train_steps_per_second": 0.1
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 409299338477568.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e91269c9eae3be89407a4883df0cbd727c017e3
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "v_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..85bb8abdb020ac999ec504b7bfe4db902f9d5c55
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bda2d6cde95036419c93a716829b0284b15f940254364ee7fd021c9b07db115
+size 671150064
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c9888b056674e28fe0152b1527b8d8da571fd4a8
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d195785765d811acd791e9e869eed8b7cf69e0452dfdef7c9103f061a654e123
+size 918507402
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c3fbbc409ece9900a3cbf9952db36cebf3ee95a
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 2.4375962767612345,
+      "learning_rate": 5e-05,
+      "loss": 2.0055,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.8205678181133527,
+      "learning_rate": 0.0001,
+      "loss": 1.8155,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.1652809543443925,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.4824,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.5201152697502844,
+      "learning_rate": 0.0002,
+      "loss": 1.5361,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.3090552478809887,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.6664,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.031628287092597,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.4614,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 1.0202753941947118,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.5362,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.058916686800406,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 1.4049,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.209947453785025,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 1.4271,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.136029805718251,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.397,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.1940378677759405,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 1.4353,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.0763620564079805,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.27,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 1.0186155436117859,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 1.504,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.9058596122818576,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 1.2254,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8646082804367832,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 1.2529,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.9007250526249097,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 1.3052,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.0101800105447813,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 1.41,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.9435744648080255,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 1.3445,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.9424757697182853,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 1.3374,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9235794265909824,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 1.35,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.9344657343675837,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 1.3005,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.1400402352729915,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 1.4263,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.0028077404329654,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 1.3401,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.8953385872918737,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 1.1645,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.975688053138917,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 1.4462,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.9022176701016223,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 1.3582,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 1.0273974913918964,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 1.4179,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.0245918857785254,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 1.3315,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.934195162405148,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.3547,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.927115496292952,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 1.3661,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.8748689090750555,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.3008,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.962208141701793,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.3561,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 1.0069280840139563,
+      "learning_rate": 0.000172967916579403,
+      "loss": 1.3655,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.9243569825094787,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 1.3448,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8951086932705498,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 1.2588,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.8622664662409053,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 1.3533,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.9293730578720981,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.2414,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 1.0469127574163346,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 1.5204,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.8820185423482838,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 1.1172,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9122615718472,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 1.3977,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.8510033976335003,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 1.3419,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.9821223857398642,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 1.3981,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.9144643080831125,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 1.4276,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.8941879417566212,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 1.4579,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.9058529475179548,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 1.3782,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.9272855454987454,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 1.3791,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.8332099088043601,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 1.3699,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.8364845255823447,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 1.3083,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.9560879208789635,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 1.3531,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.8415371480344829,
+      "learning_rate": 0.000136764169663272,
+      "loss": 1.3675,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.853424371015124,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 1.3429,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.9625578075844118,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 1.2927,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.8482472192606121,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 1.2812,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.8476053067734847,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 1.34,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8527526959812046,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 1.343,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.9141929930011206,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 1.3156,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.9463779538926128,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 1.3841,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.944185912368796,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 1.3545,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.8369356187298539,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.2654,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9066908409453514,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 1.3662,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 2.271347741753355,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 1.2857,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.802300358511912,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 1.249,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.8234655930884818,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 1.229,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 1.0623250114905893,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 1.3819,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.8416321789047925,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 1.3624,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 1.0433051505010305,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 1.438,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.8373819372038648,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 1.2292,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.8156628394687041,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 1.2318,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 1.0744275187211567,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 1.3872,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8259842891973573,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 1.2973,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.8889081422789769,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 1.3242,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.8768836198798882,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 1.1688,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.8643228297558432,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 1.1996,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.870246487198974,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 1.405,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.9310147988449106,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 1.2846,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8547840042469805,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 1.2263,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.7826078161467719,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 1.1951,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.9003174084433649,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 1.3574,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.7798062968098542,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 1.2386,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9384176057411918,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 1.2428,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.8138110165553195,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 1.2728,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.7562590090488508,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 1.1451,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.7636838854961598,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 1.2535,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.7869922876906755,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 1.2666,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.884478858985538,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 1.2799,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.8109606603905511,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 1.2603,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.8313160829561579,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 1.3399,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.7924509618604803,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 1.2432,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.9108193579559817,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 1.2918,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9276660976162258,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 1.3278,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.7361813611911072,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 1.3238,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.885325101404829,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 1.2137,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.8879103772088985,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 1.3551,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.9449620463795166,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 1.3261,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.9964457474038876,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 1.3729,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.9418918840580128,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 1.3541,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.9080986687610889,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 1.2559,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.851676767567751,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 1.2089,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.7872410177082587,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 1.145,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9353780847877511,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 1.2498,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.8719766191624784,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 1.2702,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.7950396247078377,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 1.242,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.8416961370911875,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 1.3646,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.8572495478701838,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 1.2794,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8521127011891462,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 1.2827,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8252536924805987,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 1.2652,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.9154015891631243,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 1.2429,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.878794667923896,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 1.2176,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.875395034576249,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 1.3073,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.8745853976117788,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 1.2886,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.9830700358548075,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 1.28,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.8858764862186183,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 1.2357,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.7998919971474687,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 1.2607,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.7384683443866129,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 1.0985,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.811443115370641,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 1.261,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.7461134302323397,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 1.2753,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.7927512730358989,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 1.2605,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.9092973079007383,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 1.2804,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.9920121022665794,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 1.4275,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7497931860541336,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 1.1458,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.8672561201546914,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 1.2127,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.8334654251902224,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 1.2524,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.9220176752178163,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 1.131,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.8476351152901493,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 1.2145,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9123565175137601,
+      "learning_rate": 0.0,
+      "loss": 1.3582,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 40355963437056.0,
+      "train_loss": 1.325367919921875,
+      "train_runtime": 1253.4357,
+      "train_samples_per_second": 1.596,
+      "train_steps_per_second": 0.1
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 40355963437056.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9765a3e78557b6c79bb6f5bca0a034d0388a5ead
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..076ae57dc89e00607e846bda943126160d5eac2d
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09e6207f9d9fb334a33ff935bf554ab20ceaa9634495a6f768138cc0247bc27b
+size 671150064
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b64710ee6ba62807d72285bf354c2994b5d35269
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b00d344ddcecd1782f30de60de7522e44338bd9bd5fd68b665c5e9a298d3e42f
+size 918507402
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1346602eead4ea9ebf5dd9db29201bdf05c576e2
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/trainer_state.json
@@ -0,0 +1,1792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 2.454786897231832,
+      "learning_rate": 2.5e-05,
+      "loss": 2.0055,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.842436047635649,
+      "learning_rate": 5e-05,
+      "loss": 1.8155,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.4608343313778398,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 1.5465,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.04404253663071,
+      "learning_rate": 0.0001,
+      "loss": 1.4925,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.4833433883239422,
+      "learning_rate": 0.000125,
+      "loss": 1.6817,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.3463159602018167,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.5435,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 1.2565394186018986,
+      "learning_rate": 0.000175,
+      "loss": 1.5802,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.0992906450167048,
+      "learning_rate": 0.0002,
+      "loss": 1.4388,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.9800827433283528,
+      "learning_rate": 0.0001999915737775817,
+      "loss": 1.4302,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.0377034629839987,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.3866,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.1873953407603806,
+      "learning_rate": 0.00019992417251814282,
+      "loss": 1.4402,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.0901190708201218,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.2677,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 1.0525552249283692,
+      "learning_rate": 0.0001997894154323911,
+      "loss": 1.4872,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.9983966962380565,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.2229,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.9391956274259196,
+      "learning_rate": 0.0001995873933559535,
+      "loss": 1.2639,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.9298725207140135,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 1.3107,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.0109608082604975,
+      "learning_rate": 0.0001993182424657285,
+      "loss": 1.4046,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.9783080720697636,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 1.3579,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.9069991356656613,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 1.3355,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9895451465879731,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.3439,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.986267073744202,
+      "learning_rate": 0.0001985793250766098,
+      "loss": 1.3042,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.1534014257751493,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 1.4238,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.0255571791577869,
+      "learning_rate": 0.00019811005665931205,
+      "loss": 1.3415,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.9274064718692961,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.1667,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.0859074770833808,
+      "learning_rate": 0.0001975746552556772,
+      "loss": 1.4558,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.8987327600865704,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 1.3593,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.9315367888961246,
+      "learning_rate": 0.0001969734817634044,
+      "loss": 1.424,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.0397623290053568,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 1.3356,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.9239779767358856,
+      "learning_rate": 0.00019630694141514464,
+      "loss": 1.3457,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.9256155259019234,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 1.3682,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.91453564542336,
+      "learning_rate": 0.0001955754835053459,
+      "loss": 1.2999,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.9322616402712055,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 1.3528,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 1.0110738671839277,
+      "learning_rate": 0.0001947796010873974,
+      "loss": 1.3682,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.9209930367156439,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 1.3418,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.9131101646709927,
+      "learning_rate": 0.0001939198306412775,
+      "loss": 1.2602,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.8886169513541825,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 1.359,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.9280189508217098,
+      "learning_rate": 0.0001929967517119289,
+      "loss": 1.2432,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 1.1198070932319397,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 1.5276,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.9003335984159968,
+      "learning_rate": 0.0001920109865186052,
+      "loss": 1.1269,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9797642104699517,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 1.4041,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.8417683479387569,
+      "learning_rate": 0.00019096319953545185,
+      "loss": 1.3465,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.9818092313272447,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 1.4078,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.9083765370663439,
+      "learning_rate": 0.00018985409704360456,
+      "loss": 1.4321,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.8941496427189838,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 1.4587,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.8839595066828793,
+      "learning_rate": 0.00018868442665510678,
+      "loss": 1.3793,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.9549893377990281,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 1.391,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.882414820292664,
+      "learning_rate": 0.00018745497680896722,
+      "loss": 1.3803,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.887759800655994,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 1.3198,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.9492260628073359,
+      "learning_rate": 0.0001861665762396974,
+      "loss": 1.3614,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.8762951830541706,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 1.3855,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.8915934670890578,
+      "learning_rate": 0.00018482009341868697,
+      "loss": 1.3612,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 1.0115455917430354,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 1.3162,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.8741912749537363,
+      "learning_rate": 0.00018341643596879367,
+      "loss": 1.2894,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.8441946856123005,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 1.3515,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8809778516688372,
+      "learning_rate": 0.00018195655005254273,
+      "loss": 1.3595,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.9433398546492309,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 1.3354,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.964971265725767,
+      "learning_rate": 0.00018044141973434758,
+      "loss": 1.394,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 1.0418275518352318,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.3663,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.8516899579807444,
+      "learning_rate": 0.00017887206631718203,
+      "loss": 1.2768,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8962604414615124,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 1.3755,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 1.0298256962484944,
+      "learning_rate": 0.00017724954765415137,
+      "loss": 1.3205,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.8289660154098288,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.2592,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.9298213657742931,
+      "learning_rate": 0.00017557495743542585,
+      "loss": 1.2407,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 1.106967597246562,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.4217,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.8814103122645993,
+      "learning_rate": 0.00017384942445101772,
+      "loss": 1.3808,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 1.1443583040123078,
+      "learning_rate": 0.000172967916579403,
+      "loss": 1.453,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.8711492300529738,
+      "learning_rate": 0.00017207411182989832,
+      "loss": 1.2567,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.8436613350439512,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 1.2643,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 1.0512295898273085,
+      "learning_rate": 0.00017025021625596853,
+      "loss": 1.4127,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8602871887637837,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 1.3148,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.9067935512993148,
+      "learning_rate": 0.0001683789671614107,
+      "loss": 1.35,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.8788883808272243,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 1.1869,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.9460567866998142,
+      "learning_rate": 0.00016646162589796615,
+      "loss": 1.2199,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 1.0036077278420452,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.4394,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.9452753464036726,
+      "learning_rate": 0.00016449948488669639,
+      "loss": 1.2931,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8233442283123868,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 1.2427,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.8615498620797277,
+      "learning_rate": 0.00016249386674680184,
+      "loss": 1.2248,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 1.0537862842321968,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 1.4126,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.8605446318443883,
+      "learning_rate": 0.00016044612340408466,
+      "loss": 1.2592,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.8682553736624375,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 1.2749,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.8611624544092797,
+      "learning_rate": 0.00015835763517965673,
+      "loss": 1.3008,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.7778043968372704,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 1.159,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.8267947712581017,
+      "learning_rate": 0.0001562298098595078,
+      "loss": 1.2829,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.8619985794810339,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 1.3103,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.9595046423703979,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 1.3134,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.8074353076035666,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 1.2772,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.8372427936508989,
+      "learning_rate": 0.00015186191068884775,
+      "loss": 1.3604,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.8332005936689508,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 1.2626,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.9439628299152224,
+      "learning_rate": 0.00014962478110547918,
+      "loss": 1.3441,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.996012432825993,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 1.3717,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.8258261527226268,
+      "learning_rate": 0.0001473542009760343,
+      "loss": 1.3518,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.8778655106027685,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 1.239,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.9540956569028518,
+      "learning_rate": 0.0001450517008290827,
+      "loss": 1.3892,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.9759804751023198,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 1.355,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.9465521978636429,
+      "learning_rate": 0.00014271883270950073,
+      "loss": 1.414,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.9737987698753232,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 1.3891,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.9099939493752546,
+      "learning_rate": 0.00014035716913228568,
+      "loss": 1.2908,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.9059674667292016,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 1.2388,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.7761495052755369,
+      "learning_rate": 0.0001379683020225714,
+      "loss": 1.1604,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9480743996261369,
+      "learning_rate": 0.000136764169663272,
+      "loss": 1.2747,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.9348510892211489,
+      "learning_rate": 0.00013555384164256048,
+      "loss": 1.315,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.8759026106815115,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 1.261,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.8988522453335569,
+      "learning_rate": 0.00013311541550609565,
+      "loss": 1.3952,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 1.0173472319077939,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 1.3036,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.9741728528764557,
+      "learning_rate": 0.00013065466728160252,
+      "loss": 1.3275,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8999294894193648,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 1.2809,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.90520417051285,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 1.2641,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.8890234119872065,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 1.2489,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.8474743399902241,
+      "learning_rate": 0.00012567285335732633,
+      "loss": 1.3252,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.8658741276450808,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 1.3124,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.8981624669190241,
+      "learning_rate": 0.00012315514574583113,
+      "loss": 1.293,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.9264292061687182,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 1.2691,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.8632538887733768,
+      "learning_rate": 0.00012062182995929882,
+      "loss": 1.2964,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.7531324392533881,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 1.1218,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8372202446838725,
+      "learning_rate": 0.0001180746136283638,
+      "loss": 1.2861,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.7756688230347595,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 1.3032,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.8215704713405931,
+      "learning_rate": 0.00011551521375359206,
+      "loss": 1.2896,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.8939563107843507,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.2896,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 1.0694989586581007,
+      "learning_rate": 0.00011294535554810354,
+      "loss": 1.4404,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7928451007447335,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 1.1585,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.9138057796186851,
+      "learning_rate": 0.00011036677127465889,
+      "loss": 1.2114,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.8812247071590515,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 1.2766,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.9140878350260794,
+      "learning_rate": 0.00010778119907799398,
+      "loss": 1.1352,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.8566159540621913,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 1.2235,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9116304056158802,
+      "learning_rate": 0.00010519038181318999,
+      "loss": 1.3621,
+      "step": 125
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.7755679747113441,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8344,
+      "step": 126
+    },
+    {
+      "epoch": 1.016,
+      "grad_norm": 0.6979054522941274,
+      "learning_rate": 0.00010259606587086783,
+      "loss": 0.7669,
+      "step": 127
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.7291268179512961,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.7925,
+      "step": 128
+    },
+    {
+      "epoch": 1.032,
+      "grad_norm": 0.8154205675076477,
+      "learning_rate": 0.0001,
+      "loss": 0.8207,
+      "step": 129
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.8030396568749524,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.8186,
+      "step": 130
+    },
+    {
+      "epoch": 1.048,
+      "grad_norm": 0.8770791344771406,
+      "learning_rate": 9.740393412913219e-05,
+      "loss": 0.8436,
+      "step": 131
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.8439224955672564,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.691,
+      "step": 132
+    },
+    {
+      "epoch": 1.064,
+      "grad_norm": 1.0048193558934282,
+      "learning_rate": 9.480961818681004e-05,
+      "loss": 0.8403,
+      "step": 133
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 1.0523542503386671,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7973,
+      "step": 134
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.9002095632718905,
+      "learning_rate": 9.221880092200601e-05,
+      "loss": 0.7315,
+      "step": 135
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 1.0630429397417207,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8238,
+      "step": 136
+    },
+    {
+      "epoch": 1.096,
+      "grad_norm": 0.9190093600470862,
+      "learning_rate": 8.963322872534114e-05,
+      "loss": 0.6638,
+      "step": 137
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 1.079431501244743,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.7734,
+      "step": 138
+    },
+    {
+      "epoch": 1.112,
+      "grad_norm": 0.9806969575217238,
+      "learning_rate": 8.705464445189647e-05,
+      "loss": 0.7376,
+      "step": 139
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.9299842546825281,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7011,
+      "step": 140
+    },
+    {
+      "epoch": 1.1280000000000001,
+      "grad_norm": 0.86875813110234,
+      "learning_rate": 8.448478624640797e-05,
+      "loss": 0.7586,
+      "step": 141
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.9244830667073946,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7808,
+      "step": 142
+    },
+    {
+      "epoch": 1.144,
+      "grad_norm": 0.8451369002160248,
+      "learning_rate": 8.192538637163621e-05,
+      "loss": 0.7508,
+      "step": 143
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.8870725287895037,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.7372,
+      "step": 144
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.9227181372026184,
+      "learning_rate": 7.93781700407012e-05,
+      "loss": 0.7934,
+      "step": 145
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 1.011467321825371,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8319,
+      "step": 146
+    },
+    {
+      "epoch": 1.176,
+      "grad_norm": 1.019105698928754,
+      "learning_rate": 7.684485425416888e-05,
+      "loss": 0.7234,
+      "step": 147
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.8950069527342469,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.7635,
+      "step": 148
+    },
+    {
+      "epoch": 1.192,
+      "grad_norm": 0.9609590149702135,
+      "learning_rate": 7.432714664267373e-05,
+      "loss": 0.7848,
+      "step": 149
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.000983387982248,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7037,
+      "step": 150
+    },
+    {
+      "epoch": 1.208,
+      "grad_norm": 0.9044957343906033,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 0.6679,
+      "step": 151
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.9191152811134384,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.7899,
+      "step": 152
+    },
+    {
+      "epoch": 1.224,
+      "grad_norm": 0.9035340124406496,
+      "learning_rate": 6.934533271839752e-05,
+      "loss": 0.7174,
+      "step": 153
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.9056583346175192,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7246,
+      "step": 154
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.9420824209712398,
+      "learning_rate": 6.688458449390437e-05,
+      "loss": 0.7473,
+      "step": 155
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.9877125280086316,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.6951,
+      "step": 156
+    },
+    {
+      "epoch": 1.256,
+      "grad_norm": 0.9362781914039232,
+      "learning_rate": 6.444615835743955e-05,
+      "loss": 0.7093,
+      "step": 157
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.9371151683347106,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.8175,
+      "step": 158
+    },
+    {
+      "epoch": 1.272,
+      "grad_norm": 1.0140898027379972,
+      "learning_rate": 6.203169797742861e-05,
+      "loss": 0.6958,
+      "step": 159
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.0998722652608337,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.7738,
+      "step": 160
+    },
+    {
+      "epoch": 1.288,
+      "grad_norm": 0.9450932338254235,
+      "learning_rate": 5.964283086771435e-05,
+      "loss": 0.7078,
+      "step": 161
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 1.0699624736401447,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.8035,
+      "step": 162
+    },
+    {
+      "epoch": 1.304,
+      "grad_norm": 1.742699415489815,
+      "learning_rate": 5.728116729049928e-05,
+      "loss": 0.707,
+      "step": 163
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.9281568756309962,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.7321,
+      "step": 164
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.9896754789509472,
+      "learning_rate": 5.4948299170917325e-05,
+      "loss": 0.773,
+      "step": 165
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.9613196439719053,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.739,
+      "step": 166
+    },
+    {
+      "epoch": 1.336,
+      "grad_norm": 0.8612910635418535,
+      "learning_rate": 5.26457990239657e-05,
+      "loss": 0.7113,
+      "step": 167
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.9934810738394599,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.776,
+      "step": 168
+    },
+    {
+      "epoch": 1.3519999999999999,
+      "grad_norm": 0.8947682005770344,
+      "learning_rate": 5.0375218894520834e-05,
+      "loss": 0.7245,
+      "step": 169
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.8593670171654829,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.734,
+      "step": 170
+    },
+    {
+      "epoch": 1.3679999999999999,
+      "grad_norm": 0.9551317826270233,
+      "learning_rate": 4.813808931115228e-05,
+      "loss": 0.7536,
+      "step": 171
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.9646754007650834,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7289,
+      "step": 172
+    },
+    {
+      "epoch": 1.384,
+      "grad_norm": 0.9443811214884448,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.739,
+      "step": 173
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.8894525263479396,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.714,
+      "step": 174
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.0258290832265347,
+      "learning_rate": 4.377019014049223e-05,
+      "loss": 0.7807,
+      "step": 175
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.9773569061242865,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.6801,
+      "step": 176
+    },
+    {
+      "epoch": 1.416,
+      "grad_norm": 0.8195593757551065,
+      "learning_rate": 4.164236482034327e-05,
+      "loss": 0.6585,
+      "step": 177
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.8856301350683881,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.6718,
+      "step": 178
+    },
+    {
+      "epoch": 1.432,
+      "grad_norm": 1.032547376478014,
+      "learning_rate": 3.9553876595915375e-05,
+      "loss": 0.763,
+      "step": 179
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9696986764115734,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7687,
+      "step": 180
+    },
+    {
+      "epoch": 1.448,
+      "grad_norm": 0.9585486030234329,
+      "learning_rate": 3.750613325319817e-05,
+      "loss": 0.711,
+      "step": 181
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.8487980548445815,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7318,
+      "step": 182
+    },
+    {
+      "epoch": 1.464,
+      "grad_norm": 1.245397825528434,
+      "learning_rate": 3.550051511330361e-05,
+      "loss": 0.7734,
+      "step": 183
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.8973712731694369,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.7002,
+      "step": 184
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.9725393858342541,
+      "learning_rate": 3.3538374102033866e-05,
+      "loss": 0.7616,
+      "step": 185
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 1.117394190148479,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7912,
+      "step": 186
+    },
+    {
+      "epoch": 1.496,
+      "grad_norm": 0.8912340910228728,
+      "learning_rate": 3.1621032838589305e-05,
+      "loss": 0.6443,
+      "step": 187
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.8866116563120968,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.7704,
+      "step": 188
+    },
+    {
+      "epoch": 1.512,
+      "grad_norm": 0.7749023022858249,
+      "learning_rate": 2.974978374403147e-05,
+      "loss": 0.6558,
+      "step": 189
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.9365999731244843,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7603,
+      "step": 190
+    },
+    {
+      "epoch": 1.528,
+      "grad_norm": 0.9766004674575304,
+      "learning_rate": 2.7925888170101665e-05,
+      "loss": 0.7267,
+      "step": 191
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 1.0390916143424298,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.8023,
+      "step": 192
+    },
+    {
+      "epoch": 1.544,
+      "grad_norm": 0.9147338948135607,
+      "learning_rate": 2.6150575548982292e-05,
+      "loss": 0.6871,
+      "step": 193
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 0.932755369920222,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7526,
+      "step": 194
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.935429666752721,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.6711,
+      "step": 195
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 0.8845809906688457,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.6961,
+      "step": 196
+    },
+    {
+      "epoch": 1.576,
+      "grad_norm": 0.9995414517223054,
+      "learning_rate": 2.2750452345848682e-05,
+      "loss": 0.7372,
+      "step": 197
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 0.9468268923677634,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7459,
+      "step": 198
+    },
+    {
+      "epoch": 1.592,
+      "grad_norm": 0.9145494700163316,
+      "learning_rate": 2.112793368281799e-05,
+      "loss": 0.7345,
+      "step": 199
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0705558425892678,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.7512,
+      "step": 200
+    },
+    {
+      "epoch": 1.608,
+      "grad_norm": 0.9633908526891983,
+      "learning_rate": 1.9558580265652448e-05,
+      "loss": 0.7458,
+      "step": 201
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 0.9217792604616163,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8034,
+      "step": 202
+    },
+    {
+      "epoch": 1.624,
+      "grad_norm": 0.9439148119334351,
+      "learning_rate": 1.804344994745727e-05,
+      "loss": 0.7709,
+      "step": 203
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 1.2065917749966415,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7415,
+      "step": 204
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.9549382902209208,
+      "learning_rate": 1.6583564031206357e-05,
+      "loss": 0.6937,
+      "step": 205
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 0.9953482294115805,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7345,
+      "step": 206
+    },
+    {
+      "epoch": 1.6560000000000001,
+      "grad_norm": 1.0391151845768114,
+      "learning_rate": 1.5179906581313064e-05,
+      "loss": 0.7517,
+      "step": 207
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 0.892304198205609,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.6669,
+      "step": 208
+    },
+    {
+      "epoch": 1.6720000000000002,
+      "grad_norm": 1.0266783762809504,
+      "learning_rate": 1.3833423760302611e-05,
+      "loss": 0.7266,
+      "step": 209
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 1.1356445620964943,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.7033,
+      "step": 210
+    },
+    {
+      "epoch": 1.688,
+      "grad_norm": 1.0125564012072685,
+      "learning_rate": 1.2545023191032801e-05,
+      "loss": 0.7136,
+      "step": 211
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 0.9028529514701249,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7006,
+      "step": 212
+    },
+    {
+      "epoch": 1.704,
+      "grad_norm": 1.0456487739230185,
+      "learning_rate": 1.131557334489326e-05,
+      "loss": 0.7334,
+      "step": 213
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 1.0317362890417232,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7419,
+      "step": 214
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.9311035227081702,
+      "learning_rate": 1.0145902956395447e-05,
+      "loss": 0.7117,
+      "step": 215
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 0.9331344553753933,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.6911,
+      "step": 216
+    },
+    {
+      "epoch": 1.736,
+      "grad_norm": 1.1060613237728212,
+      "learning_rate": 9.036800464548157e-06,
+      "loss": 0.7269,
+      "step": 217
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 1.015541013094166,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7157,
+      "step": 218
+    },
+    {
+      "epoch": 1.752,
+      "grad_norm": 0.8510540833090268,
+      "learning_rate": 7.989013481394814e-06,
+      "loss": 0.715,
+      "step": 219
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.9268564574728297,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.6754,
+      "step": 220
+    },
+    {
+      "epoch": 1.768,
+      "grad_norm": 0.9207812232471209,
+      "learning_rate": 7.003248288071118e-06,
+      "loss": 0.6912,
+      "step": 221
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 0.8179210209673273,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.5982,
+      "step": 222
+    },
+    {
+      "epoch": 1.784,
+      "grad_norm": 1.2179160408590126,
+      "learning_rate": 6.08016935872251e-06,
+      "loss": 0.749,
+      "step": 223
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 1.0116883719900378,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7632,
+      "step": 224
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.9667019863085939,
+      "learning_rate": 5.22039891260262e-06,
+      "loss": 0.7349,
+      "step": 225
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 0.9824595257081432,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.6991,
+      "step": 226
+    },
+    {
+      "epoch": 1.8159999999999998,
+      "grad_norm": 0.914323365177283,
+      "learning_rate": 4.424516494654118e-06,
+      "loss": 0.734,
+      "step": 227
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 0.9171539637252152,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.6559,
+      "step": 228
+    },
+    {
+      "epoch": 1.8319999999999999,
+      "grad_norm": 0.9296078028858084,
+      "learning_rate": 3.693058584855369e-06,
+      "loss": 0.6004,
+      "step": 229
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.9768565124078812,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.6891,
+      "step": 230
+    },
+    {
+      "epoch": 1.8479999999999999,
+      "grad_norm": 1.095726657475443,
+      "learning_rate": 3.026518236595621e-06,
+      "loss": 0.7675,
+      "step": 231
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 1.0273439273800609,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.7454,
+      "step": 232
+    },
+    {
+      "epoch": 1.8639999999999999,
+      "grad_norm": 0.9590456904201695,
+      "learning_rate": 2.4253447443228106e-06,
+      "loss": 0.6743,
+      "step": 233
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 0.9745320151965339,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7886,
+      "step": 234
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.0763904305244638,
+      "learning_rate": 1.8899433406879608e-06,
+      "loss": 0.7467,
+      "step": 235
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 0.967581764456723,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.6785,
+      "step": 236
+    },
+    {
+      "epoch": 1.896,
+      "grad_norm": 1.0982009578130143,
+      "learning_rate": 1.4206749233902084e-06,
+      "loss": 0.698,
+      "step": 237
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 0.8776694361072739,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.742,
+      "step": 238
+    },
+    {
+      "epoch": 1.912,
+      "grad_norm": 1.19156386832082,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 0.712,
+      "step": 239
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.102094357079355,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.7427,
+      "step": 240
+    },
+    {
+      "epoch": 1.928,
+      "grad_norm": 0.9264182869001772,
+      "learning_rate": 6.817575342714988e-07,
+      "loss": 0.6734,
+      "step": 241
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 1.0486964114257968,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7543,
+      "step": 242
+    },
+    {
+      "epoch": 1.944,
+      "grad_norm": 0.9750425672218928,
+      "learning_rate": 4.126066440464982e-07,
+      "loss": 0.7329,
+      "step": 243
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 0.9780928365989245,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7039,
+      "step": 244
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.061220289490227,
+      "learning_rate": 2.1058456760891798e-07,
+      "loss": 0.8341,
+      "step": 245
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 0.9416367422713224,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7403,
+      "step": 246
+    },
+    {
+      "epoch": 1.976,
+      "grad_norm": 0.9265996957737294,
+      "learning_rate": 7.582748185719358e-08,
+      "loss": 0.6653,
+      "step": 247
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 1.0104858211229228,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7525,
+      "step": 248
+    },
+    {
+      "epoch": 1.992,
+      "grad_norm": 0.8765473318432947,
+      "learning_rate": 8.426222418311814e-09,
+      "loss": 0.6066,
+      "step": 249
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.9455283227165583,
+      "learning_rate": 0.0,
+      "loss": 0.6425,
+      "step": 250
+    },
+    {
+      "epoch": 2.0,
+      "step": 250,
+      "total_flos": 81089103593472.0,
+      "train_loss": 1.037638253211975,
+      "train_runtime": 2502.8991,
+      "train_samples_per_second": 1.598,
+      "train_steps_per_second": 0.1
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 81089103593472.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5995eae1b8c90e2ccac3c70c288f75cec71c83a
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..71701bd992146f189da03e0329a3d7922e681da6
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95784096674574f6950ba70e2eedb295d32f854fdd95a4a7454a03e1388f5b93
+size 671150064
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c9e59f4ecc1ec529c37a3d7b3171da8c029880ca
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b29a9500e381d736d83fc8fb868393fcb4882fb12bf439cb83277f333ec14778
+size 918507402
diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..02b10f2cf163875b7056baa99142c333faf3ff2e
--- /dev/null
+++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.9276346784275542,
+      "learning_rate": 2e-05,
+      "loss": 1.7768,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.6481278982394791,
+      "learning_rate": 4e-05,
+      "loss": 1.538,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.7717724000401631,
+      "learning_rate": 6e-05,
+      "loss": 1.6643,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.4798087223498675,
+      "learning_rate": 8e-05,
+      "loss": 1.745,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.5040913189510572,
+      "learning_rate": 0.0001,
+      "loss": 1.6507,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.4179433288448318,
+      "learning_rate": 0.00012,
+      "loss": 1.4839,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.390878794328348,
+      "learning_rate": 0.00014,
+      "loss": 1.5861,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.159005693005469,
+      "learning_rate": 0.00016,
+      "loss": 1.4928,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.0655915255744721,
+      "learning_rate": 0.00018,
+      "loss": 1.4568,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.0850599508988026,
+      "learning_rate": 0.0002,
+      "loss": 1.5257,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.0538700008730024,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 1.3898,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.1490879941978314,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.4767,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.002696197709384,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 1.3906,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.0994669012710434,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 1.3314,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.1006062488575192,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 1.4306,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.1638333681456912,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 1.4754,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 1.0752931115953115,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 1.5654,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.4343590589869308,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 1.4925,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.024250588988728,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 1.4888,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.9485578932937413,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 1.2813,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.0130437917922024,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 1.2961,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 1.1200060173705617,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 1.3012,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.939748034453486,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 1.3703,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.9919126812101607,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 1.4193,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9686773170539923,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 1.4145,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 1.1428306722186963,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 1.5102,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.9949347203047502,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 1.1556,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.8848229387853778,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 1.3756,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.9900335819187616,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 1.4067,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.9801011347209362,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 1.3651,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.9365165117736447,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 1.4403,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 1.0123569730398672,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 1.4026,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.9736841721199668,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 1.4121,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 1.1071688313996157,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 1.3487,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.1060493331445256,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 1.4219,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 1.026938645087035,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 1.3932,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 1.1557966387978893,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 1.3466,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 1.0078215594281974,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 1.3699,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.9811913896716529,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 1.2409,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.8918197387654728,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 1.3851,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 1.068089022611143,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 1.2598,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 1.0478030296296357,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 1.4167,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.8100338238370158,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 1.2201,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.8517143366939263,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 1.1609,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.9142506272664487,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 1.3413,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 1.0536733695691438,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 1.3029,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.9221962771066791,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 1.3558,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.910936924116227,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 1.2835,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 1.1056786632238242,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 1.4781,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.0454075542982686,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 1.4421,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.9283596325199607,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 1.2897,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.9848959751841972,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 1.3819,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.9673118165984793,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 1.4272,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.9432928631761596,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 1.3185,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.961549810649517,
+      "learning_rate": 0.000189241899082916,
+      "loss": 1.3047,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.966460247332086,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 1.3066,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.9154876554983992,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 1.2382,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.9921361426235112,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 1.3354,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.8773066871627214,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 1.2401,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.9892167716904691,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 1.3274,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.9692891455072072,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 1.3451,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.9936653236889128,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 1.3666,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 1.0220940618417156,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 1.3666,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.872041348906631,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 1.1922,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.9828590750917678,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 1.4001,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 1.0032588614829971,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 1.3512,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.9312261542560399,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 1.3406,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.9844326224338974,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 1.2899,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.9865649903509482,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 1.3333,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.83819063618543,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 1.3818,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 1.0551126917051847,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 1.3504,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.8632160203247755,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 1.3005,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.9440754764890599,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 1.2767,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.8735135525555031,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 1.3807,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.856740028229761,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 1.197,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.9264103960369618,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 1.3382,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.9204087723578827,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 1.2589,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.8668385341278854,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 1.3496,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 1.0097296204009092,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 1.4332,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 1.026107451056098,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 1.4299,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.9086670325992544,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 1.2304,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.9338245765912986,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 1.2421,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 1.095641194729298,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 1.3781,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.9084520911757816,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 1.3417,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.9649982305360747,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 1.4502,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.866091858490719,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 1.2619,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.8513403052208766,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 1.3397,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.917755765312962,
+      "learning_rate": 0.00016884803286362,
+      "loss": 1.259,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.8494853247000094,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 1.3267,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.8654871261827237,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 1.164,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.9552195348413874,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 1.3154,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 1.016127615130936,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 1.4602,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.8884802039436105,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 1.3322,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.9107081196079571,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 1.2707,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.9518776066420643,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 1.3673,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 1.0212478759273667,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 1.1936,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.9056282973849618,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 1.3772,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.8251877121032121,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 1.1911,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.9289746754002959,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 1.3496,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9128957367242889,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 1.2479,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.7965044140608732,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 1.1455,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 1.0545930599310702,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 1.3209,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.8852713683280387,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 1.2693,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.9174434467161392,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 1.2562,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.8400842605770039,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 1.156,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 1.0611988942111712,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 1.315,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.9147798982224843,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 1.3202,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 1.0010578693236878,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 1.3362,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.9000872074490198,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 1.3446,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.7920512263187361,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 1.2624,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.9461381629633617,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 1.3432,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.9492837163279607,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 1.2902,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.8488571384929292,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 1.1759,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.9186243276864146,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 1.3183,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.9045365204837359,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 1.2769,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.8145440663745105,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 1.1665,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 1.0801018363645023,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 1.2615,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.8347270943328522,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 1.1709,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.8953758008715978,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 1.3476,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.9339933247904735,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 1.2882,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.7712534114542834,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 1.1018,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.8307908587514617,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 1.3319,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.8672221825940479,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 1.2412,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.8485691861687462,
+      "learning_rate": 0.000137546377942393,
+      "loss": 1.1078,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.819638464973109,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 1.2004,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.9482780315861542,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 1.4392,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.8086620843754946,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 1.1491,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.8707780475245294,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 1.2727,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.9425633093531728,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 1.2641,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.9528069327815744,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 1.381,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 1.032159872857148,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 1.2908,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.8591875555532374,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 1.2944,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.8669588989282161,
+      "learning_rate": 0.000128717230790931,
+      "loss": 1.3257,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.9175575752707962,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 1.1591,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.916532881566656,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 1.3507,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.8344006687821425,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 1.238,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.8908164072105582,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 1.2133,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.8258874851557738,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 1.2407,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.8168770012860906,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 1.2627,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.9672601465907635,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 1.4012,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 1.0297331391395985,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 1.2051,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.8954821340132485,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 1.1384,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.9673654445988941,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 1.3527,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.9265597815605019,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 1.2943,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.8175004558718472,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 1.2177,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.8333702019225635,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 1.2051,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.7569650720830139,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 1.1324,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.9357586879033147,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 1.2851,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.8527284834939162,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 1.2396,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8594577247326713,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 1.3016,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.8688050502279938,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 1.2143,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.933697227598257,
+      "learning_rate": 0.000109348690758,
+      "loss": 1.3447,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 1.0161891719030802,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 1.167,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.926586483980676,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 1.2326,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.7956334969811154,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 1.1309,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.8239994579933885,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 1.1648,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.8348427384230527,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 1.357,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.8258203560381552,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 1.2152,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.8482515431381136,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 1.1935,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.8417744852317112,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 1.3203,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.8611627618982554,
+      "learning_rate": 0.0001,
+      "loss": 1.2677,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.8406854312604937,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 1.1903,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.8067394788759179,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 1.2452,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.846955968603144,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 1.3268,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.8314645079471286,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 1.2503,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.8366588389815677,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 1.244,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.9291738661860993,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 1.3705,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.9352632281589096,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 1.1825,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.794306846381449,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 1.1809,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.891848905066318,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 1.2885,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.9009410327369045,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 1.2426,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.7983808800209792,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 1.1466,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.8158403213845199,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 1.2988,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.9028836450931733,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 1.1745,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.9192920894100809,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 1.2563,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.9073659197361555,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 1.2116,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.8601282806731785,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 1.2211,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.7743572852622722,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 1.1189,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.9731152954185913,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 1.2922,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.8269711251624151,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 1.2394,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.8636114941268018,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 1.1756,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.8001603451805631,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 1.3446,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.8294495413728634,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 1.1428,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.7970768519309458,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 1.1597,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.8343954735798168,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 1.1475,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 1.1367065956930875,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 1.1926,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.7774104715732815,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 1.1298,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.8255964324237054,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 1.2593,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.7843123105664823,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 1.2835,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.7981085568864953,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 1.2071,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.721122994064396,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 1.183,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.8989434699612031,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 1.1542,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.8142294986053386,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 1.1944,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.7911113853918703,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 1.1117,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.7173845638795395,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 1.1267,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.7854681270504528,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 1.1885,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.8331468548214547,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 1.2341,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.816416495817863,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 1.0917,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.8666692389516789,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 1.2418,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.778466139322238,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 1.1086,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.764141246873744,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 1.0999,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.7766672707410954,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 1.1619,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.8356257804017168,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 1.1816,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.8983453768366347,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 1.2263,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.8247821633222613,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 1.1575,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.8287735532750757,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 1.1057,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.8095241240636343,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 1.2601,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.8479222640578135,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 1.3422,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.8010885490765423,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 1.1548,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.801934289730396,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 1.1854,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.8011647459338709,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 1.3251,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.7679823492923111,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 1.2011,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 1.2209222931014045,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 1.3057,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.8172392109293101,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 1.1466,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.8084414078781809,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 1.2086,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.7799909624044505,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 1.0911,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.799724183341175,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 1.1077,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.9004210665974285,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 1.188,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.883984385574042,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 1.1851,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.8981544710149507,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 1.1885,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.8067748995831314,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 1.2239,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.7961366302755665,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 1.1601,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.9390958958344794,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 1.188,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.8775126473919165,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 1.2275,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7760005346742462,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 1.1591,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.7834072172886265,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 1.1553,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.8443666697676242,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 1.1369,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.8014359622570144,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 1.159,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.7616605489390753,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 1.1311,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.8969179735835168,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 1.1737,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.9018871934259612,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 1.1813,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.8382979101420402,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 1.1886,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.8018934631010715,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 1.2241,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.7390291843770445,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.9976,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.9266438365579518,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 1.2341,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.861065268104146,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 1.2627,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.8204617046880526,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 1.178,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.9265792236123463,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 1.2555,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.8613593275799136,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 1.2695,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 1.1038814044988352,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 1.1719,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.905407690012056,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 1.1587,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.8002036080708511,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 1.0718,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.7551531161165965,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 1.0928,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.725768166113705,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 1.2068,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.8109713482708016,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 1.1437,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.8117450836482087,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 1.2281,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.8523708555547533,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 1.1589,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.8080157824825525,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 1.271,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.9926926564662885,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 1.2264,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8747788681716573,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 1.1876,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.858557919178996,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 1.1884,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 1.1027947452090465,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 1.3269,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.9616315049301722,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 1.266,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.8627891810031095,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 1.1313,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.8603099014912703,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 1.1797,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.6788146705314012,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 1.0931,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.8343403001633107,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 1.2136,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.8155842206759587,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 1.0883,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.7596472178159187,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 1.1554,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.8202439148773495,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 1.1459,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.8370741064646355,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 1.2309,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.7976979167753185,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 1.1261,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.8128289382009063,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 1.2263,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 1.0706639890399756,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 1.2743,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8111350315322916,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 1.091,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.8645351791577299,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 1.1923,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.8831423664098689,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 1.3053,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.9087013463848541,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 1.1431,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.8453476640976382,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 1.1361,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.7991537010464119,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 1.2065,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.8294822038333,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 1.1782,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.903460681143856,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 1.2391,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.8322544796923445,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 1.1208,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.8384660345857029,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 1.2535,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7931126532809545,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 1.1714,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.844869330986358,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 1.1538,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.8709269057729458,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 1.2546,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.7558598839450004,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 1.1193,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.8500033430267443,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 1.152,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.7729090524617063,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 1.1098,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.7643772648840976,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 1.0279,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.8276269212663161,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 1.2287,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.7807491891276336,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 1.0979,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.7825104357246059,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 1.1803,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.7874570354765664,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 1.212,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.8623185804959949,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 1.0947,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.8211354353553579,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 1.2393,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.7681659932492936,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 1.1523,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.8577824703991259,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 1.2206,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.80270605462384,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 1.057,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.7348124182641901,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.9738,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.9447233425970715,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 1.1511,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.8043668353555504,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 1.1463,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.8339332728481555,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 1.213,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.8401094805427077,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 1.239,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.8883832277628466,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 1.2046,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.9959065976233928,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 1.1497,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.7880542961669169,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 1.1489,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.8092589785189201,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 1.1295,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7317415886804474,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 1.1087,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.9057811805320088,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 1.12,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.7744919130570085,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 1.1901,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.8641892028445843,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 1.1875,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.8278884975053862,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 1.1478,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.8878776299342533,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 1.2387,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.8175876526171806,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 1.1862,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.7199639827824592,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 1.1295,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.9136510681471197,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 1.2803,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.767248727443108,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 1.1746,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.8395636925652074,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 1.2465,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.9605079921621236,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 1.3639,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.7456669378605656,
+      "learning_rate": 0.0,
+      "loss": 1.1567,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 102004528611328.0,
+      "train_loss": 1.2587099098242247,
+      "train_runtime": 3131.9625,
+      "train_samples_per_second": 1.596,
+      "train_steps_per_second": 0.1
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 102004528611328.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2fd299d79b29b6a152f1741aab425b54aadad65f
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "k_proj",
+    "up_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0e3ace7fdce87934f95075283e224ccd6395530c
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f35b404e5f5d58ff8b8b5a7a22896100bbdc8a816d6323849bed80b5bcc2dc2c
+size 671150064
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..11f9e2fc695df84fc5f15eef7c6cf87b419cc570
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0f4f8316bacd73d83b96d0e1cfb5b58cad4c4aadef49fe67a33aa091a5e9072
+size 918507402
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..66206d707f1892425ae16f72e179c3368be5953a
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.7110440684465407,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 0.9068,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7402343530118121,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 0.9466,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.700186513981636,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 0.9259,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.5505407762435491,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 0.8359,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.4862152844581287,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.8108,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.4309565303252267,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.7751,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.6187148335727641,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8066,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7302821056119174,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.821,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5060929149374445,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.7962,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.4458004825377806,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.7821,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.4089589669778015,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.7828,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.3601723384901577,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.6884,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.40858125080193214,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.765,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.34659150917180453,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.6784,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4154170252774343,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.7487,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.35305856643040606,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.6666,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4051307765789964,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.7008,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4704633484539053,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.7038,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.35216674124673664,
+      "learning_rate": 0.0002,
+      "loss": 0.699,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.3940341166792085,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.7319,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.32907304141610794,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.6622,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.36339116687554524,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7147,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.3807439948921444,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.7533,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.3871403568340088,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.6999,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.3839673604576073,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7195,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.3441053998710712,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.7251,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.35243220597752656,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7196,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3636373360678986,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7013,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.38690640735599346,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7257,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.34432537602176394,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.6949,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.33886239466871904,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.7076,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.39604847662347764,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.709,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.3554397918223366,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.6936,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3006245693098016,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.6309,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.3460179345437749,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7033,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.3580661160694404,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.6624,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.3675642471853245,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7166,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.33195058427509394,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.6981,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.36505875925567893,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.6965,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.33771080687308747,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.681,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.34732300147085565,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.6926,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.3892677134753642,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7061,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.41174857195156683,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.727,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3261432831176223,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.6659,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.39023105588833745,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7317,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3470903187488154,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.6618,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.44533985824170724,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7189,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.38820559490715234,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7184,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.3309722521348095,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.6738,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3500223436629588,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.6605,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.3667381135063131,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7317,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3302936992504295,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.6474,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.3679651518546313,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.6687,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.33494114144170967,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.6662,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.40296578279584266,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7072,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.32730591928259145,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.6951,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.3062729660279465,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.6254,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.47647017324525437,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7032,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.32627709157890566,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.6167,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3329798568108714,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.6676,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.3891963711757699,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.6965,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.33183293946745057,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.6392,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.33912037930238675,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.6356,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.32664447889845427,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.6339,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.3300921726396346,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.6419,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.32753730385613744,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.6538,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.3595156771897832,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7021,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.34599170888547853,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.6761,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.3072498490442669,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.6617,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.29256884683023665,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.6049,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.35571382028549814,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.6909,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3617050838266649,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7114,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.3401169135033129,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.6848,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.35361065732811076,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.6923,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.3583639301348245,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.6702,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3036815741783258,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.6226,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.34885191390743076,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7084,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3282951690750017,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.6456,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.3676583612320544,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.6878,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.33870275217077395,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.6961,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.3239305669676888,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.655,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3036751743254618,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.6076,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.3299563834929254,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.63,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.345960064508727,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.6894,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.3188538305366772,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.6605,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.317225148263491,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.6245,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.3215959016705841,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.6901,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.39325835809764975,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.6956,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.3222902742014463,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.6066,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3348215618090499,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.6664,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.31538169493425555,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.6534,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3129197934323035,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.6362,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.48046051152079117,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7376,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3750084153380919,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.6733,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.28943781471618396,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.6143,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3697511920947507,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.6227,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.3220730779992305,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.6422,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.32426781868870586,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.6473,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.36372082103869735,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.6677,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.32810076407887667,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.6241,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.32386807063212686,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.6301,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3220906865944551,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.6106,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.40958173832989664,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.6824,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3416481259773205,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.6465,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3501750365308928,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.6715,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3933598149181734,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7267,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.3477897871913843,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.6765,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.34931633673439555,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.6476,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.3615756469744839,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.651,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3588830035989954,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7245,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.32400355120523916,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.6511,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3139049955415279,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.6874,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.32974412650898166,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.6542,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.32521135211297236,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.606,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.3274458228827635,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.6197,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.38249267054912756,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.6922,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.3993895495653672,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7049,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.35465045760647435,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.6708,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.3366336875124197,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.6431,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3490649974959498,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.6577,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.3753710847840562,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.675,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3236435175590241,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.6544,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.3344044866776899,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.6665,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40250024802517503,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.6715,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.3210471159114599,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.6152,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.30939342647725143,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.6417,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.34480787064227664,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.6839,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.34282744371958,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.6705,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.31002911403922345,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.6157,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.327253325750313,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.6641,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.3239163540448087,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.6544,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3352972454556464,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.6324,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3578753303826149,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.6436,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.38389784430305535,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.6577,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.33577156813804415,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.6351,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.31939866459158905,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6207,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.3411423086425535,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.6639,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.34919337262794314,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.6558,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.3562855429858843,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.6677,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.36959206915419046,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.6552,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.36018173281575633,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.6277,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3312018888377899,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.6228,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.34710619478939314,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.6397,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3616780762901335,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.6647,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3632272942971575,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.6823,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3438490233014574,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.6479,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.30793762863839114,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.6031,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.32997593549516097,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.6577,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.3292567598218092,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.6773,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3489678086055255,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7063,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.324031447810286,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6527,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3569167593361834,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.6576,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3112375499035634,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.6022,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3480171639311505,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6312,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.33785670131306333,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.5784,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.32973378317808705,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.6269,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.321638262841785,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.5982,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.33916451990137586,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.653,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.346726007252727,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6409,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3499513321280793,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.6522,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.3166028999401929,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.6542,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.33194231087455445,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.618,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.3071551680466884,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.5776,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3176933297860166,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.5975,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3236083324367126,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6001,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3242228487517961,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.5985,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.3433505723805961,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.6402,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3142632180901461,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.619,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.3650080087368739,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6645,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.33213013539789776,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.6311,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.32299500479841137,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6439,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3325685962656323,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.6388,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.29601692313967864,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.5875,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.31069581062561197,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.6235,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3335052906646829,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6386,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.315143057504453,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6057,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.34013732175970035,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.6102,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3392534389907391,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.6176,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.346885901966819,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.6773,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3110542617842148,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.6189,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.33175430427217634,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.6374,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.32724307311049633,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.6631,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.33837839138182874,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.632,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3346809848936033,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.5922,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.32538956568562816,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6399,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.30362553981973994,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6159,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.3379322372857405,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6059,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.29726833260616514,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.6127,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.30324116876677487,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.625,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3116739250863844,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.6415,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.31621045811587917,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.6105,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.35486237890466066,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.666,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.33064598256285993,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.6874,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3180002664113174,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.6281,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3305608635057252,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.6225,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3347652151951757,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.6242,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.27991874031675124,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.5405,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3162354099409061,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6009,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.32486415305433153,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6002,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3646449871084355,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6499,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.31688261087946556,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.6017,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.33798344241209227,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.6342,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.2998562294365411,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.6216,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.33028086875759405,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.634,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3474490101498229,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6173,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.32937156718869864,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6257,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3219749951301766,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.662,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3561197924950446,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6569,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3244256712146143,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.6594,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3381412265786872,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6464,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.3238669811135605,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6027,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3280219085314321,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.6288,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3685876179481929,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6255,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.29270701118818704,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.5863,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3474896849101967,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6566,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.32082996987901913,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6429,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.30825421233708616,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.5864,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.36395065272004196,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.6453,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.34726778931857044,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6858,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.33674366113332843,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6295,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3674013055955959,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6069,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.31996724678426863,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.5908,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.31666566972160265,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.623,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3677352283800213,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.6453,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3574559425690372,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6501,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4988636076953785,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6514,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.31060166601964173,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6134,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3472702683654676,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.5882,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3463745142979321,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.6492,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3479775333527672,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.6677,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.33334390558230875,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6231,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.30961419862981415,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.5667,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3094987062712464,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.5594,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3159953848404255,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.5843,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3665073287523895,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.6432,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3463574230115457,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6306,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.31587888241197065,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.5616,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.35804068720403287,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6201,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.30834708658834076,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6296,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.32627377791243184,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6205,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.31645951528801003,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6248,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3104451415173383,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6095,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.2890888494258903,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.5883,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.29338240459411014,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.6043,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.32262457740542627,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6419,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.33009135754906616,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.6093,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.3158757926032022,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.605,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.31245531255722064,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.5791,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.34480793865812576,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.64,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.34835203084995453,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6129,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3433649816275233,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6378,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34142040864867906,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.6127,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.33077044742523903,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.5955,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3319211688043142,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6164,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.3215426812914164,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.67,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3272276807128324,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6543,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3388778535567206,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6384,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3268286368522702,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6395,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.3279167955970497,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.5976,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.34207456240072015,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6101,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.31432358611256983,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.5871,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3097995350253823,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.5971,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.31957824226435905,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.6141,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.33556461316683844,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6184,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3257640201823845,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6043,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.32743782814789024,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.595,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.31806595157011563,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6115,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.30977554761088855,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6193,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3342560970887094,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6232,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.33069944231914344,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.6129,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.31480071435918866,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.5862,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.32243920235963874,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6247,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.33812522280669727,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6527,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.33089269374465646,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6002,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3646236367929071,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6411,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.2799400640797182,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.5564,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.32703269570958576,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6225,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3629491452820718,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6047,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3451267184380675,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6548,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.31381151848276595,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.5983,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.34476408803594877,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6573,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.32193052407965933,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.5542,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3226898058737763,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6068,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.31976219084762086,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6125,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3252185719704812,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6061,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3326884379691767,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6147,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.33492432797209265,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6123,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3164586148445968,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6251,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3090687482048777,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6153,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3128302069224832,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6185,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.29351845518854947,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6018,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3410055854158688,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.5789,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3141087954157077,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6005,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3390354929842119,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6353,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.296516570958744,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.5377,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.33066364468327153,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.5861,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.33208494496920954,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6163,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.32052779611504956,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6017,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.38249530580741814,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6612,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3066619083863857,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.5741,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3051923913503972,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6014,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.30702785701491847,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.5953,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.2979662489245293,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.5848,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3161998126919067,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.5741,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3253871022364271,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.5985,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.34784480437776627,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.604,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3183414818409375,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.5785,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3597552732010806,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6179,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.32689548044171374,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.5677,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3537374631617811,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6252,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3069063861327447,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.5801,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3655647459141965,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6507,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.28855866544087244,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.5789,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3310666913511004,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.5927,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3128799657254568,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.5774,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.45241050341416117,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.6626,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3121988566030735,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6474,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.2987472152843115,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.587,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3330186838552484,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.5733,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.38804357512944315,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.5818,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.29517399138629286,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.5785,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.30226149985906536,
+      "learning_rate": 0.0001,
+      "loss": 0.6248,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.28609710190348236,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.5425,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.29415325411634424,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.5756,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.33285141117958783,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6037,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.30157945611754017,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.5466,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.34447034812304733,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6158,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3294249930520986,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6006,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.33164246733032615,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.5825,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3107641846660284,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.5848,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3145409558213135,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6177,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.31753153253665634,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6091,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3157578704685954,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6084,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.29476395762907864,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.5841,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3209582290404505,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6408,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.40444408574813595,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.5806,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3319189186745888,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.5628,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37186268404405437,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6173,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.32202567918702646,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6355,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3566642646810941,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6211,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.33734408956869044,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.603,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.33271896050771005,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.5862,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.3140618334759957,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.5936,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.30530598221384914,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.5852,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3720165481328377,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.621,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4127542577896669,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6037,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.29375796385920805,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.603,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3154118064740421,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.5921,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3372373705593303,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6117,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3133153360665321,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.5629,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3123915646510453,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6136,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3091600893466998,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.5723,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.34631992093275005,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.5251,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.2980341545388974,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.5809,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.34817708486534094,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.568,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.35450939715310636,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.5828,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3203799738608221,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.5698,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.33422873597279046,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.5796,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3338188435622777,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6019,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.328280335637955,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.5858,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.33567994116448024,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6026,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.31031837812977875,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6125,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.32926091956212744,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6006,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.36993353060267264,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.5733,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.31499830986752403,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6259,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.32188347085701724,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.5917,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.3061945600647162,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.614,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3073702794282459,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.5617,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.30515399425767387,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.5912,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3355739467311519,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6067,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3468971034802618,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6384,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3116793267107946,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.5852,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.3047471597740809,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.5636,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.29591949315461463,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.5955,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3241793514631215,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6125,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.2969675059082848,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.5837,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.2895976804540707,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.5228,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3330341571035036,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.5934,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.37117430157855713,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.573,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.33656138401884717,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.5979,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.32473469764325097,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.5832,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.31633341000198745,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.561,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.3543881916244574,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.5922,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.32944401599102163,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.5781,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.29379043631370816,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.5585,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.30087163699355657,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.577,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.3305440990682953,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.5972,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.32808249115703125,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.5646,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3579019126827201,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6178,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.31229957334655506,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.563,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.30876924398557587,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6047,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.334833827162927,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6016,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3154101243775418,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.5769,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3156947224159009,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.5603,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.30358467735576816,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.5619,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3747209131021904,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6279,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.34430479121130414,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.5865,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.32400213742109385,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.5477,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.385300934700712,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.5844,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2966689332023562,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.5705,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.2931038318652647,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.5782,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3351747643450468,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6056,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.30684605499457235,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.5488,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3279054154311602,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.5839,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.30962367293197873,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6076,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.31679931254330107,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.5871,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.32141908822706244,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.5469,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3527971138131436,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.603,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.27369281915172705,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.5474,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.2905310605819737,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.5533,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.296175952659622,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.5147,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.2951710811283539,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.5362,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.33059665402204946,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.5844,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.30176317940139036,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.5507,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.30774773664578586,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.5282,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5117447996571544,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.5492,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3421289435965022,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.5875,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.354463766375705,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6217,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3154260452857168,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.5499,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.2969536159605783,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.577,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.31177957234143505,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.5569,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3019463727112216,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.5822,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3058338570000804,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.606,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3230837429064014,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5651,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2678609967121915,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.527,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.31984197931050856,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.5277,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.36787971775645245,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.5831,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.31761183735193627,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6092,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3209030010567008,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5839,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3048133128165885,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.5736,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.28120456740912425,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.5471,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.37214820992626707,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6119,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3408766459891202,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.5843,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.27993522238408386,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.5368,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.33430364727703993,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5607,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.31370924363308805,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.5796,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.3158247100165067,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.554,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3293365360340102,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.616,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.3102964238553234,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.5566,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3320311237718539,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.5976,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.39732095318780175,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.5625,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.31155382051607555,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.5411,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3323104297517485,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.5865,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3712237621744867,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6318,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3229042644926411,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.5743,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.29907881399082403,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.5664,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4441456508733791,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.5882,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3519351419650644,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5916,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3046891795610562,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5939,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.28455026675791334,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5188,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3325770886862265,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5869,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.2973809949258795,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.5421,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3221347374942181,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.5864,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.34687591239754206,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.5944,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3352608379297791,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.5771,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3305224911439026,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.5905,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.2965071411303001,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.4929,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3527451252706239,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6096,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.30724814051450583,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.5701,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3469545333516701,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.5809,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3432172543820323,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.5815,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.27208246581981216,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.5307,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4100518601876637,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.5817,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.34061629245083513,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6113,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.30675429430291684,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.5424,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.31742149090771027,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.5611,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.324121004647133,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6087,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3289709513645346,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.5866,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.3039839724902563,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6005,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3227359743275398,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.5755,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3312219578229909,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.5951,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.2607892687043467,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.5239,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.3211451583419221,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.5996,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3261838860983297,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.5394,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.31184360385586457,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.5579,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.352870753034346,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6094,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.2783628041575873,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.5212,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.321237438184481,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.5614,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3019928769523239,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5561,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.29832262198894516,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.5377,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.323634424531092,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.618,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3282204388290083,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5931,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.35625417801874015,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.589,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2844237915466193,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.5143,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.29732264384556584,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.5476,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.2856189609161265,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.516,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.33400754346314043,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.5725,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.329844208780943,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.5457,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.29412208186368505,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.5305,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3067167595539764,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.5952,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.31507436481266066,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.5721,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.374802888789554,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.5098,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.32859006997932044,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5806,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.32874810609039673,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.5841,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.28836419309288286,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5541,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3099871606982036,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5606,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3221853052094363,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.5389,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.2894838671575457,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.5381,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.30988958158731844,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.586,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31132930659044444,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.5506,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3263325087242562,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.5206,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.35348868141276724,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.5538,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.33519734489221437,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.5787,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.35515766666587856,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.5603,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3538496256081159,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5965,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.34314938114488563,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5603,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.2995789633564441,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5263,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.36551548091125113,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5799,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.33608376035602755,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5498,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.311043421155776,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.5645,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3156023943290121,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.5488,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.33536823334656135,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.5563,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.33526801042681004,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.484,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3313356108133414,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.5671,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3032308382473139,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.5313,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.35514370450316773,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6026,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.32561672086420923,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.5632,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.32605764722031383,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.5493,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3300675802793695,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.5661,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3149622536766178,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.547,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.36686698274862983,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.5503,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.38360774512659207,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.527,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.33270680502856625,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.5638,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.31349166170125337,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5654,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3337236613807481,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5681,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3284719148971575,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.5755,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3166628892383498,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6076,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.2923848352653842,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.5732,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3009077815189411,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5436,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3130027772871504,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.5625,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.3361962977258804,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.5807,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.29608833926553046,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.5269,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3270524860119114,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.5816,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3430437346896367,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.5731,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.32535876490467913,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.4918,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3231292747667162,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5872,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3251794730113907,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6055,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.31024411607401303,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.536,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.32455636870036475,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.5415,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3150003047627516,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.5166,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.29348215525540744,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.4773,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.292859851123708,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5636,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3300194120158872,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.5522,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.2963361079298774,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.5493,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.29879896580398585,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.5591,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4116034082913684,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.5883,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3232773360036941,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.5737,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3134894851722256,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.5848,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3528480304212816,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6011,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3310343277309279,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5872,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.29087492472336945,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.5401,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.30926114646884223,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5854,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3219895811423112,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5649,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.32051340587736893,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5897,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.29727631379917907,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.5261,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3034336648188552,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.5318,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3075346806380729,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.5652,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.2809666320569592,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.4978,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.32906249814027355,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.5675,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3433895713502097,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.5659,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.299004282903824,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.5388,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.31081135142207966,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.5737,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.3321533336299761,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.5229,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.32310942065808324,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6103,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.35428031176295227,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.5895,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3226230204094095,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.5796,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.31844821904730863,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5565,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.31127211469742905,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5742,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.3301616178055294,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.5633,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.31088406778226274,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5741,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.30910524243719273,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5422,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.32891907673013904,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.5939,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.31765925114121074,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.5525,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3265729180708036,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.5591,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.304115871463315,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.569,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3108161445075911,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.5302,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.3117508517796498,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.5924,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3173885393960302,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5513,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3065343745357514,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.5488,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3465215111780123,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.5718,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3139184869222331,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.5634,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3347273007314033,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.5388,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3255954139922336,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.5913,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.31531159165403394,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.531,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.30838344199204837,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5312,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.31304073052384795,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.5543,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.28806534227726394,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.5099,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.30803254142561787,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5429,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3233273532446496,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.5475,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.314011051336649,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.5825,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.31288304219875024,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.5351,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4138905691158364,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5927,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.33771234928093735,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.5824,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.29905255464308417,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.5495,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.40814635147033207,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5305,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4552099236409849,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6151,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.29847599170770306,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.5433,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3107431078356281,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5635,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.2993825608025531,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.5608,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3220569129147483,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.556,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.2944064570193302,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5398,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3095022936191454,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.5168,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3721121088717959,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.5385,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.287888409393853,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5547,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.27515729289166646,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.4845,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.35344587392267507,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.5638,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3601029556024769,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.5279,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.30954690722928574,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.5541,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.29250672200275046,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.5425,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3679308518468009,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.5775,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.2986869169097338,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5344,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4446774451930178,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5608,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.31519624442850597,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5073,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3073412869268294,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.5422,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3498546428328075,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.5048,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3227086733836615,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5576,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3131763041284998,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.5746,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.30994026071777964,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5458,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.304032587550163,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.5185,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.32560298395148213,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.5953,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4316946416052306,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.5302,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 1.148032759175294,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5889,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3273908378003577,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.5348,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.29324728887148505,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5362,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.33246322889785346,
+      "learning_rate": 0.0,
+      "loss": 0.5623,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 617553363992576.0,
+      "train_loss": 0.6094743517398834,
+      "train_runtime": 9982.4112,
+      "train_samples_per_second": 1.002,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 617553363992576.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0bd037a10dfc12a3cd60f9e2e9e7b84b48b58a11
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..95ed70cf0151c04d096f86686598432852f2757a
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d18065781693bb13fdd4485b21e7f60e2b9088ff799e15113a2288f346cce37
+size 671150064
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..230329cf72d3a62bdb02956b5934a079996be505
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e580902e5072fe04b7f4500f34f52d53625f8d78936978957c8fcbdc12bfe68f
+size 918507402
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5745ffc710cf2db3994d48173e15137dedaef571
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.7333467964438457,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 0.8541,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.7046812923627103,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 0.8938,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.8023384874584811,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 0.9444,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.6592371105137336,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 0.879,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.5489801752269747,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 0.8878,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.49330057949697514,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 0.8441,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.4514470086044233,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 0.8222,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.46072800657223917,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 0.8425,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.5343209805429193,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 0.809,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.5681041941951939,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.8089,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.5984329597457433,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 0.8328,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.5019398392065417,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.7657,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.4568875591342637,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 0.813,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.4638574433579869,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.7875,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.45583896124706985,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.733,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.44012367968398347,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.774,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6381321163246892,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.7551,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.38086721843816257,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.7197,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.41242103873065866,
+      "learning_rate": 0.0001,
+      "loss": 0.7259,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.4031452967079881,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.7345,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.3796337899011198,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.7308,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.3589672408280759,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.7252,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.36779673876973484,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.7553,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.3771478046107944,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.728,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.4059869425404555,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.6706,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.41816849965572295,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.7328,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.4083488253667742,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.7246,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.3806308894022481,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.7368,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.3686860299373134,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.6777,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.3911767574920794,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.7319,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.33941469797174856,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.6488,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.33953284273489737,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.6693,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.32790137430310623,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.6598,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.366265210647376,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.7223,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.3710081737672853,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.6841,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.3662173246501666,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.6701,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.38472592055372573,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.7235,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.33667608978973407,
+      "learning_rate": 0.0002,
+      "loss": 0.685,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.3245830016683275,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.6846,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.3622644699856565,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.6687,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.38426542618145665,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.7106,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.35786200143014124,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.7038,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.4612420884227193,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7225,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4006770487146382,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7333,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.3733574872750548,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.7105,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.3448576640940017,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.6873,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.3543916842716417,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.7174,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.43857513510849166,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.706,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.3789729781625536,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.657,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.3577421848104737,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.6551,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.3729045151309279,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.6635,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.36050892352274594,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.6734,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.33619674572344266,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.6697,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.3511748484935949,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.6932,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.3338714829275817,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.679,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3546683605903518,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.6951,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.34590425068052655,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.6999,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.34937606227550067,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.6687,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.3401995704397724,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.6405,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.38521477320962205,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.6971,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.36610187767563834,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.7103,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.36747178830589144,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.6695,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.36260599210373223,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.7015,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.34234793606854774,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.6397,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.3502511952025378,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.67,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.3654449065132795,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.719,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.3789789626981662,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.7189,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3315091880490501,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.6778,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.3786473692725525,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.6965,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.36763485908472676,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.6963,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.35592546585794316,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.6586,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.34593530293633556,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.71,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.31652977117815895,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.6175,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.3279341370437573,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.6442,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.30589567213978114,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.6053,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.37411511157136124,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.6604,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.33254333104980854,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.6504,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.34114993306366115,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.6446,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.3660399567176618,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.654,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3401691894450481,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.6689,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.382049598789673,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.7018,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.37980106479306164,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.6632,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.4024825236670562,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.6724,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.35981220589930224,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.6421,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.3290097261427852,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.661,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.3512867136385803,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7121,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.3223343141825949,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.6741,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.33715632997441913,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.6758,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.33980240202859846,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.6961,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.33840677725516743,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.6802,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.34861377177855213,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.6518,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.34992870540252374,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.6477,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.38417763225511253,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.6532,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.36598538222084,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.6674,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.3142884383546262,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.6046,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3538134463621248,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.6937,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.31998281722492944,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.6391,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.36592875585738904,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.6818,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.362517266127021,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.6487,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.32690240419691086,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.644,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.31337521433189724,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.6405,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.3536448102446704,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.6443,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.3838886237343212,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.6911,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.34521287318071664,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.6639,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.3885827888518703,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.6821,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.3774208870941129,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.6477,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.3487294577299662,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.6402,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.3675667985641538,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.6411,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.3858224565215601,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.6396,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.3312217529929406,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.6141,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.334488409574564,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.6182,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.32561586510457413,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.6181,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.35314824041217946,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.6561,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.40543110997691756,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.6565,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.34572707660144664,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.635,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.41880695361859205,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8094,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.4314435242035476,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.6529,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.3458276721677694,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.6085,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.345832220010251,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.6492,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3832504293765825,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.6623,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.36633565364216164,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.6423,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.3829393918649457,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.6464,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.34672910506272275,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.6458,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3852087511837911,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.6585,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.33913584509396805,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.6459,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.3478104764584854,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.6529,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.38015962327569774,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.656,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.31846132254752074,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.6191,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.3090484661720633,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.6191,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.3309332991113378,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.6376,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.34607157007406497,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.656,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.34154025516290404,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.6613,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.34621999136149534,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.663,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.31938959567418773,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.6392,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.354949769989008,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.6416,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.38189814518037746,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.6921,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.3659263446214391,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.6499,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.3328330282062205,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.6315,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.32116564042899076,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.6258,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.37785042372798666,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.6576,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.3094529863079494,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.613,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.3137324032253774,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.6249,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.38180488121497574,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.6766,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3644983311374808,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.6301,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.3317008198946184,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.6577,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.3694225450245007,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.6587,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.3828605568034377,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.6622,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.36181988330545833,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.6683,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.35047126739354484,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.6871,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.3308112712054644,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.6832,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.33134319006255447,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.6696,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.34666324653052366,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.6725,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.36207621794732525,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.6732,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.33889345106741003,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.651,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.35159449636801615,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.6571,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.35584438905062943,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.6717,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.35199238498112145,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.6468,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.31843385563990295,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.5959,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.3740596945424377,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.6538,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3550232628004775,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.6348,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.3486879260513687,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.629,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.38571634904297875,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.6244,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.35061083378189084,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.6643,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.33223226481959944,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.6149,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.32043302030456344,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.6205,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.32250710950305245,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.631,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.3171309934126043,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.6219,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.34830888209551053,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.6399,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.34760275654822936,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.6128,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.33888876826964004,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.6224,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.35209782663951905,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.628,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.33820840008987435,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.6359,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.3664340083977849,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.6673,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.34094112806828475,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.6589,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.33821570267165935,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.6574,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3186428146376564,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.5942,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.3451631464189648,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.6211,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.35343049372918955,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.6467,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.36376785759597074,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.6669,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.35064007261406416,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.6569,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.30431346771152046,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.5859,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.3311512501444353,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.6157,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.34243549624437286,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.6625,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3410091522230089,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.6946,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.3421852451349417,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.634,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.32865521453219587,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.6529,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.34062067626916304,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.6695,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.33536717311969466,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.6457,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.33990576364220726,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.6762,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.32843308367784046,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.6631,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.33223961737400787,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.6338,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.35501297943771504,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.6236,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.3430432408972357,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.6365,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.38476315719148757,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7256,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.3346137083977566,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.6229,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.3082807963901158,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.6043,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.3136974323360699,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.6522,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.3572327998101803,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.6599,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.3290643907110654,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.6367,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3668339281339755,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.6643,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.3216040123384671,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.5818,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.3381108626702672,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.628,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.3309441524962154,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.5479,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.362723924528029,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.6453,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.3627063433665466,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.6117,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.3305218390152876,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.6173,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.33151598385948866,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.617,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.34691784822633,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.5999,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.2968593137797099,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.5524,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3033544821112245,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.5948,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.31610195297938865,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.6322,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.34840628740571494,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.625,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.3528847028604991,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.6272,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.34476033038334497,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.6307,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.3090337292429833,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.5959,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.31740805110292,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.5934,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.3424042297072991,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.6204,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.3616309533366346,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.6341,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.32840944549521617,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.6287,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.33634508751845704,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.6501,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.345376073506382,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.675,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.321779898847676,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.6224,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.3317479094471431,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.5951,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.32745902841744945,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.6338,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.3345831834056288,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.5968,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.40885289247034085,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.6411,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.30078215781686657,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.5971,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.32460199442321497,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.6376,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.3533886940677027,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.6903,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.3374240648532365,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.6594,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.3386338355930694,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.641,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3649927062956018,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.6367,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.32354185562358234,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.6185,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.34540969342540034,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.6427,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.3367172613058143,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.6488,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3248751260302771,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.5906,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.3343029055249699,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.6313,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.36348786188614796,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.6579,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.3381220559557464,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.581,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3726975314977943,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.6073,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.32277931313111685,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.6089,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.31838728825263063,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.5844,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.3378246630597595,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.6187,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.28655347766899164,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.5831,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.305738884397888,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.5803,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.33699359311357896,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.5935,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.31150735826778087,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.641,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3054786148140765,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.6593,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.34948532216662775,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.5906,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.33864084582756415,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.6236,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.34634624154896837,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.6826,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.35260995832319136,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.5873,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.33682247133338794,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.6607,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.346085289606286,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.6567,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.3393048241910544,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.6619,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3003583628662106,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.6177,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.34016554903106605,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.6691,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.3093553378225518,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.614,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.33172592867243444,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.6436,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.29544828097432757,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.5766,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.333956977312725,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.6268,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.35148063395313595,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.6259,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.3651031312943574,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.6612,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.326094381969768,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.6068,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.3674698984073789,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.6951,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3188621148135757,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.6065,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.3710220035442832,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.6886,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3532434378333304,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.6434,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.35527501155029584,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.6517,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3916507293055691,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.6503,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.32044950584072485,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.6202,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.32282933301493805,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6444,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.3133558079994367,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.5897,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.35815970108664874,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.6477,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.310639374075745,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.5964,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.32845584266465505,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.5722,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.31126980443075597,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.5861,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.36275063956995357,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.6681,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.32341296377966344,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.601,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.33371756721528906,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.6371,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.3210595730247353,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.604,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.33978554404755684,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.6232,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.3386691752916447,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.6353,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.36082000765336303,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.6172,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.28688120900525155,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.5644,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.3542563143605929,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.6407,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.33798332974615464,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.5897,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.35372407086058777,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.6274,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.31344756354870335,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.5996,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.336677637693999,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.6402,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.3372768178475663,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.6203,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.2942773640929049,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.6023,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.3522102150676642,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.6351,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.31741912184406945,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.6009,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.31924641317688096,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.6108,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.37053405946908835,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.6431,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.31961598906952976,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.6316,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.3634402137789276,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.651,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.31948674209526967,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.6035,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.299548802883558,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.5926,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.32402448050086785,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.6156,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.33880235096169364,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6279,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.39083307094174247,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.629,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3219566382994085,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.6479,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.2994626226151748,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.5806,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3232838605331977,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.61,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.3497376990079617,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.5986,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3507562759459691,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6309,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.37058372545630297,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.6328,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.33061497130522244,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.6104,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.3186501826235579,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.5812,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.41864475687139424,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.6127,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.3205208087991299,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.5832,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.32498438069860025,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.5907,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.37112126992550315,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.6245,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.3046377834490436,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.5908,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.3521974098484267,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.6266,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3318392213699102,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6101,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.3328213775760183,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.6411,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3419657527360827,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.6207,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.3499038698126123,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.6674,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.3586849462117495,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.6361,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.34369836364013817,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.6314,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3417683310024924,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.6613,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.3409954866063188,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.6489,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.31603895890078487,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.5854,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.3333999839718448,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.6252,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3069820322086464,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.5925,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.34072642945605036,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.6001,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.35920669108348435,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6175,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.3783316154216495,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.6116,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3371072543398276,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.6039,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.3395864137013351,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.6455,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.33436473615672774,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.6157,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.34399483461043867,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.6388,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.31793307107047525,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.6074,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.33446208035550756,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.6139,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.3571652352841207,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.608,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.3320381930882504,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.5973,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3510144786574423,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.5891,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.3274044045131555,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.6348,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3269897438226837,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6086,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.32574199716607716,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.624,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.33312146479263194,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.6109,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.3467304661728986,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.6533,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.2937669247519489,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.5568,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.32697302232779996,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.637,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.33008922674291,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.5947,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.35932702083702056,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.6224,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3488947331373348,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6285,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.33888121269893695,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.5889,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.40010242072052316,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6248,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.31657595290120183,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.611,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.3319297476858135,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.6418,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.34036386362008975,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.6112,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3523951602730713,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.657,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.31335321558367357,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.6226,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.3013914565914275,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.6288,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.34300227799897226,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.6555,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.30634783522717546,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.5639,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.3425070849108771,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.617,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.30179565706233397,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.5843,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.3296112433632852,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.6033,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.2945134672184559,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.555,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.33776637698122036,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.6066,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.3423568940979264,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.6326,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.32648811706587916,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.6105,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.32912624202267404,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.6173,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.293587610047466,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.5404,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.33688460148914934,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6275,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.3323749410062641,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.6079,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3267444427300877,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6123,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.3436790069140749,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.628,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.32119138424362625,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6157,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5151733736289191,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.6439,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.33998083278904195,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.5653,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.33423052743474296,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.6382,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.37977710280386257,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.6551,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.32753090231737964,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.6022,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.36698761359491083,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.6158,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.3141226357753015,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.6357,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.32060289780088846,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.5887,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.3286994567711767,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.6043,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.35706097499663475,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.619,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.28066924635940443,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.5691,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.32754973215699557,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.5872,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.3092661920020084,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.6005,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.33436383411458787,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.61,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.34876358910283783,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.6556,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.2999228716581912,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.5667,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.3759584585259726,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.6834,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.29330724303471173,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.5379,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.31247952102174653,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.5804,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3714867722083103,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.5928,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.3331624506824157,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.6082,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.31057364449667263,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6172,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.30513730993747623,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.5898,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.3333233683204336,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.593,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.3718388692445804,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.6643,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3045469786263307,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.5594,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.3400954993676332,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.6452,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.33068944804413763,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.6184,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.3263005697408856,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.6137,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3799361907548381,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.6117,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.2961328400166121,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.5831,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.29802987478257925,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.5767,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.3588158832570962,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.623,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.32042047425999975,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6383,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.32407789664361586,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.6056,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3731581141563738,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6311,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.33308689757663057,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.5581,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.345094080476923,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.5997,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.3257450772192306,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6082,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.32214136459012144,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6195,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.3141660128143383,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.6027,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.33291376077434987,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.5713,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.3456442402866317,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.5992,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.32137711599982044,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.5749,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3488399941382417,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.6133,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3172833467405452,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.5964,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.3107087680682212,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.6368,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.32346772310238436,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.5702,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.293740028197297,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.583,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3131215213340511,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.6202,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.30852267611758893,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.6273,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.31073136007732777,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.564,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.2890617954917825,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.5852,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.33603207991140516,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.5863,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.32670973873126913,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.5973,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3196958708549082,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.5903,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.32502039443517705,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.5515,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3214718738126143,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6478,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.332217429288651,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.6234,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.31099973831050426,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.6197,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.2962571050160548,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.594,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3153302209931717,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.5973,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.3311960609985164,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.5776,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.2979953357678523,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.596,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.33082862308923683,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.6185,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.30598496662422275,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6153,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.3280300082629786,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.5917,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3251028559551198,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.5825,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.3194549680822979,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6216,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.29653194129442245,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.5765,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.34791285322763277,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.62,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3202293426291375,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6216,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.3174535575241201,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.6061,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3002687444840116,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.5822,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.5331348084291688,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.6379,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.30851128418293494,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.5986,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.31958460369884306,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.6179,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.34091527263286986,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6166,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.2990571469218516,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.5966,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.3436075957977801,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6326,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.35137030920003226,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.5645,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.33924485422613193,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6184,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.3134749139771607,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.5709,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3084857998743944,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.5896,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.34836899461435644,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.6443,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3293458971943998,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.5857,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.2931982589027787,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.5243,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.346647579881588,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6187,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.3260771339014491,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.6186,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.33632282615149567,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6216,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.3210763835251143,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.6262,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3129848743701589,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.583,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.35452368332305,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.6261,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.33979598399894534,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6157,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.35015314225851113,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.6385,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.32658054454872115,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.6152,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.315602878773404,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.6062,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.33170259327411067,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.5832,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.3328168923869451,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.5952,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.2978949712686637,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.5474,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.33412115879805643,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.5956,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3172963885067165,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.5803,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.34619898505454627,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.6414,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.34914551735458776,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.5898,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.3406617490116019,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.6064,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3759582987232799,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.5926,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.3550949299611754,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.6484,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3284855272098851,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6287,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.30199493201393424,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.5935,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3098467180783862,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.5566,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.31183291738971114,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.584,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.31929677431537473,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6131,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.41978014421811577,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.6011,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.34342195007224435,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.5592,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.30196768181200767,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.5342,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.32016809859995643,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.5513,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.3030961366563467,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.5416,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3640166896343213,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.5939,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.33311721998350496,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.5941,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.3118572958395179,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.5873,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.3524618854868946,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.6004,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.30261631398153654,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.599,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.32222661211801934,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.6179,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.32887621360130437,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.59,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.3081053135399744,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.581,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3104914902426349,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.5823,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.3240593761286958,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.5629,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.2971645909812542,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.5673,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.30765629167391056,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.5686,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34662720007108755,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.6206,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.32151305481811043,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.6041,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3026717132828533,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.581,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.3115727768344922,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6127,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.32325633413163507,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.5831,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.3406702237157173,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.6062,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.3092593810807342,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.5874,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.33249730642091496,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.6137,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.32429643339268305,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.5782,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.2856644756348792,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.5785,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3174505397082634,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.564,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.33154785899386857,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.6121,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.31824223026592074,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6125,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.3425796531184699,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.6277,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.30980473656593255,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.582,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.6076175814566195,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.5652,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.2872005508008041,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.5663,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.2941820265987994,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.5632,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.3736048758448455,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.5783,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.3139318455160298,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.587,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.32914317410037475,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.5746,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.29522944680111907,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.5543,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3337111411709632,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.5964,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.3343565592312885,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.6095,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3396141654510037,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.5707,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.30097027159385814,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.5541,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.35273747266556094,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6511,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.3148814749827936,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.5839,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3307776757603587,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.559,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.3584458844664954,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.6508,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3438687476549892,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.5832,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.34643015237385527,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.5951,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3004074425788124,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.557,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.3098596835581066,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.6101,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3154602565169677,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.594,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.31535172683979323,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.5907,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3683800650639519,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.6062,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.35901655003556643,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.5883,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.32146165807925703,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6008,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.33168383613708075,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.6119,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.35702815659174447,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6541,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.2899388905134802,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.5852,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.29446886988340504,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.5675,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.34053821314045435,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.5872,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3350005651489098,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6383,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.3370383042632032,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.5947,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3310474361028596,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.5602,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.3361021628125497,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.5578,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.31765051956438506,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.5777,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.31371062106564285,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.5438,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.33037423863396975,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.5586,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.33639075074149816,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.5955,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.31110248151937986,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.5572,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.3248594213682404,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.5598,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3123099001416393,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.5751,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.33489861794349407,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.5663,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3283720748648807,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6106,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.3275800483403531,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.6041,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3049636997648269,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.5841,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.33112859099247005,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.5874,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.38284859464913196,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.5724,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.3311207733229893,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.5726,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.32156034085457397,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.577,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.33971653595522966,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.5699,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3623093689499001,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6208,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.3290860318979573,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.5553,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3298586814615897,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.5602,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.3036153939407574,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.5632,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3239852509324012,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.5865,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.3293432127865494,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.571,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.31413022714131084,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.5731,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.3233028919849246,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.581,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.29842695942222947,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.5572,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.30617107236458035,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.5748,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.30634148267818595,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.5486,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.33002320870772317,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.5959,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.33549835176562415,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.617,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.39908094186117465,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.5972,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.29611461200931244,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.5694,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.32488354902119576,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.6111,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3289872766327485,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6114,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.37679220427159815,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.6396,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3170293818278164,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.5848,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.2904013307442176,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.567,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3315032774391374,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.5951,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.3464804958950567,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.6333,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.2956379310063911,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.5845,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.34014894088179815,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.6087,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.31311931547255073,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.594,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.30219795846098496,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.559,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.2945554836924428,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.5372,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.306847991852595,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.5731,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.2947953663893335,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.5374,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.3096735962232439,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.5859,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3230566622403895,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6206,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.329975913532662,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.5987,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.33390179362794614,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.5591,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.3263508029175923,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.5715,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3015015220251556,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.5694,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.3091036280998343,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.5597,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.33630594584449497,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6166,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.33234679395755995,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.5755,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.2754987158410254,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.532,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.3210960951919637,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.6078,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.32177088310010665,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6197,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.31415194137155206,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.5661,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.35398700420880674,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6036,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.3123690874169846,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.5686,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.31659120941358476,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.5393,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.34287302428913957,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.5907,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3194427108434081,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.5663,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.28416716013936555,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.5482,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.29934393305943974,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.5566,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.31210441356618257,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.5833,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4175710823270915,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.57,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.3005431361289694,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.5497,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.30603805485437957,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.5969,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.30839245587096714,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.5743,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.32811301561537237,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.56,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.36282994378736644,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.587,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3073779704203103,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.5662,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.3080445780693838,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.5717,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3329595020216389,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6073,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.3389768344465552,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6038,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.2920049809736086,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.5606,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.3329890595277704,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.5881,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3078511666243944,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.5684,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.2969232568921349,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.5688,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.2975446861211374,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.5866,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.3085150297080228,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.5824,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.31897289948701685,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.5892,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.2761811680872987,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.5269,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3342011446978759,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.5786,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.32936047495463505,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.581,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3310160783493966,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.59,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.3086808688168038,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.5142,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.32294814756946233,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.5676,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.335730313097155,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.5929,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.2891092682415137,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.5862,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.31391514362095485,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.5763,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.34128688111125355,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.2970950083492731,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.5636,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3322422790261592,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6336,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.3462889113767833,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6058,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3091873131073345,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.565,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.3259586772383051,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.5435,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.30355190065994,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.5578,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.3525175334175896,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.6411,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3144319852680261,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.5432,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.3606268812428096,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.592,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3285048416784418,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.5878,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.3126871249195601,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.552,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.398048211066614,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6313,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.29852580156421366,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.5876,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.29668308123383585,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.5798,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.38923416320875054,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.6116,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3151158278851179,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.5882,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.2915554377465551,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.5781,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3283600604801278,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.5915,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.2892731582576615,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.5378,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.32313775663040456,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.5856,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.3143239975542234,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.5943,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.28093040153755033,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.5538,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.29836527707274535,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.5606,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3034497970344787,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.5709,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.34855905245294183,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.6134,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.28627133463186333,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.5459,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.2912756420621914,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.5624,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.30179944173660356,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.5765,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.29890460764801857,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.5537,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3466282966197182,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6011,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.29223313989202737,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.5451,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3025698887084507,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.5513,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.31970077904393274,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.5726,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.30219260523572566,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.5543,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.3418708829034411,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.5941,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3184434479584035,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.5602,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.2803254943591195,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.5235,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3113324526358792,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.5876,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.32516855337457196,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6085,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.30435796694491285,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.5414,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.3706669941280122,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.6582,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.2909623001085036,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.5378,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.31528375783476587,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.5713,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3650121672123169,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6142,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.29119371050396164,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.5619,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3203911171044392,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.5877,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.32271298283608196,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.5283,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.28878418893810864,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.5751,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.30789216817595116,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.6049,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.31190512927915737,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.5887,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.29370360358539743,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.5527,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.32282691953164466,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.5787,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.29607470204491804,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.5365,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.28276141290643,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.5233,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.3085936472841523,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.5557,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.30258328345313,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.515,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.3030974741918099,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.529,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3032703363829799,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.5196,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.31717487898699986,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.5342,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3516402124247312,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.5776,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.2981138602796516,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.5745,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3158015784657397,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.5261,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.2882184873804512,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.577,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.2738229806252894,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.5051,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.32973780330976316,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.5888,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.29928667533722103,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.5406,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.3285204465141545,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.597,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.33658694854786897,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.5394,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.30003941393678973,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.5418,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.28418221533922805,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.5221,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.28466767665546777,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.5402,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3109103099273694,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.5595,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.3008493866712516,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.5387,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.30239359653537784,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.5383,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.30888540228832956,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.5662,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3041544703549886,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.5488,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.3095261506372009,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.5889,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.32286086032346434,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6072,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.32136638464728656,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.5359,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.28516728612063824,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.5104,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.3978012159473873,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.5992,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.36043649309480175,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6012,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.29103040749135345,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.5761,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3360190376562365,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.5908,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.3301162761711765,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.5547,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.301639490569475,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.5488,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.3087064983202081,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.5632,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3050631522442395,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.5799,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.3292292803230435,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.5417,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.30277773825080534,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.5659,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.31459609815156425,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.5278,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.31405067993214364,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.5936,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.3261864155110813,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.5373,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3256369222279084,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.5587,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.32732422612088785,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.5412,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.2909075591219546,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.5396,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.3102436144255524,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.5573,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3089849518644942,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.5607,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.3421498615774315,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.5528,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3265352951673599,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.5569,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.32080085841420525,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.5858,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.2728739185226048,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.5312,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.31824075120690043,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.5732,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.34042444345005146,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.5925,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.310324892221723,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.5497,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.32304417812133285,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6018,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.2948690578149887,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.5487,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.27983536162198325,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.5124,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.3548489303896564,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.6156,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.35373252999175986,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6013,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.31108290438872843,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.5537,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.30654349530803887,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.5407,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.27628561537490304,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.4662,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.2848952041422797,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.5305,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.2929566568970297,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.5315,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.29284105716996106,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.535,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.2859049360458969,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.5387,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3355319801411293,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.5671,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.3203929390909087,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.5538,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3185197867849292,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.579,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.3280413731803322,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.5432,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.356417217277443,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.5501,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.34291843170723035,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.5891,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.33266859183140374,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6053,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.3428756852638353,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.5835,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.32259740512068086,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.5447,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.29520311140254396,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.545,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.31599076092953177,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.5625,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.30610042652307184,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.5232,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.2875556938182346,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.5118,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.31626200677832067,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6238,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.2899591130536518,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.5142,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.34510295520184053,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6037,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3326034789489528,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.5752,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.32708452711369174,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.559,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.2752132481519014,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.503,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.2898907790607678,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.5365,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3047104154762272,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.5491,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.2985657914414307,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.5256,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.29121424575479926,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.512,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.32415455341460486,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.5653,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.2966756230431178,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.5506,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3072390598848972,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.5494,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.335230602942832,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.5828,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.31009836683608927,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.556,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.30452383537861544,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.5497,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.38462804061730854,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.5529,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.31250693282575825,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.5535,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.29974803127058314,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.5369,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.2867417866680452,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.5438,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.2997163483151464,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.5556,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.34043546647378375,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.5719,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.29164856643978054,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.5612,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3012834777462016,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.511,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.30801758600288326,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.5636,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.33889818246198006,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.609,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.31475874645443735,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.5595,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.34755245829472137,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.5808,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.27507511346149044,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.5395,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3233144309369947,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.5376,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.29949070972137315,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.5369,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.29285700624899347,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.5022,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.32795453858716095,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.5413,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.30826104843825375,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.568,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.3143327052024473,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.5331,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.31702164905563585,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.5746,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.3121974900153664,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.5598,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3012287251823861,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.5156,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.30979212824941293,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.5661,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.29784433613118455,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.5169,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.30376763893493497,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.5516,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3898317785979762,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.5805,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.3094852084266824,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.5859,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.320339966420363,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.5822,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.32006842269251284,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.5352,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.37750231534770534,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6046,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.36655609158769825,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.5919,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.28623817014447206,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.5285,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.33349397751189286,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6317,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.335647348296861,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.5885,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.3232602598959376,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.5834,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3361254671620294,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.5939,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.31945038832981176,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.534,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.31226624481325327,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.5422,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.3077602752436613,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.574,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3336995198562612,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.5377,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.35153220115023254,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.5564,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.32485718185413065,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.5886,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.30590842283104547,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.5554,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.32672208522717094,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.5595,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.3192783343397264,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.5418,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.30602064229834347,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.5657,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.29687986531333965,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.5329,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.34079652730441223,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6219,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.31774442893558724,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.5756,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.29105110444104043,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.5301,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.28591438949126685,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.4973,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.336860252500957,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.5571,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.28934036088163895,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.5367,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.29943661800798266,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.5513,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.3392111301843805,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.5423,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3107349966613786,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.5184,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.2979788532115799,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.5369,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.29924739594845984,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5513,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.33469631034634045,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.5512,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.29595906303526687,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.5319,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.30291502645029644,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.5502,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.35328192530331826,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.5762,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.28540925455822486,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.4979,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.304605777689829,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.5337,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.2729018856144321,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.5108,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.30020507460061946,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.5207,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.34238623815047864,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.5534,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.35003527619217056,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5984,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.32561025592189263,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.5739,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.33034226924047594,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.5347,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.2792723221355915,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.5258,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3044656271243137,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.517,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.3360880809613938,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.5362,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3200015230925606,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.5639,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.40887674834776866,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.5625,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3145436452910505,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.5451,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.31880844342506026,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.5465,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.34586690000736525,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.5597,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.3121607435832115,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.5423,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.29565589950502735,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5499,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.3133910880222989,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.5773,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.30250808339080254,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.5252,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.31168795673266836,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.5758,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.30860647715999096,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.5537,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.30206562614258453,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.5735,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.29150087234273553,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.5506,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.30932251242575176,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.5691,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.34688608229870943,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.5896,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.299346299652687,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.5422,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3134175521993939,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.5478,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.25714295747613014,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.4925,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3028365511780001,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.536,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.32235312108259556,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.5728,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.32081281191842137,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.5634,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.3827895095206315,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6354,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.30863741089331764,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.5537,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.3256260881947753,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.5867,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3050446437744646,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.5361,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.3274713332029523,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.5497,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3189105663709932,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.5626,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.2798343479189164,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.516,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.2954774076825385,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.512,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.3147840267355935,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.5566,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3032627309661228,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.5088,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.31170713223938146,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.5535,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.32461281893145943,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5957,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.2847997799830815,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.4989,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.33089303204648524,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5405,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.29905218250378424,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.556,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.31422611021653635,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5443,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.31144595157979055,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.519,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3123499837096723,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5426,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.3459777573293554,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.5925,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3175452941693634,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.5195,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.30567617639153344,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.5279,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.35116963657254624,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.5305,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.28375567723560385,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.5184,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.2869409192199368,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.5551,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.29937788488621553,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.4832,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3262491820167027,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.5378,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.30112379649038384,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.5688,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.29810789933135196,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.5481,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.31552908175115096,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.5245,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.29393333704746166,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.488,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.31391098423307,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.5536,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3346782470622131,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.5329,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.3672665593378921,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.5678,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.2944453587769173,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.5462,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.3370399550819899,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.5628,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3288714841640314,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.5981,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.29689548182034403,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.5266,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3080060985178883,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.4876,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.29239362748621933,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.5039,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.34189268844884885,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.5757,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.30140273049519617,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.5666,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.312712966702646,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.5343,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.32469996843838494,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.5954,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.31158861396211135,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.5284,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.3399520386722396,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.5834,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.31447096265182917,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.5595,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.3295635813346338,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.5963,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.31492787825694496,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.5387,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.34873012265855585,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.5469,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.2882059407031955,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.526,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.357003593863277,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.5742,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3059292031158361,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.5542,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.34228930243021916,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.5418,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.31358476760363263,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.5515,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.2991049253730696,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.5499,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3246964150268922,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.5379,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.2800061952399545,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.492,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.2953120642075083,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.5296,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.3245840047249902,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.5415,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3082445800103548,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.501,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.32292629991709976,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.4635,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.299094795236697,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.4686,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.3508599466147304,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.5786,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3171071081622717,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.5668,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.28065758877521735,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.5089,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.32489832926900575,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.5173,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.3095537494019464,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.546,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.32034370572961807,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.5171,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.3187215992671766,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.5504,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3146826672456126,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.4789,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.3190013611747521,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.5321,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.326748839285513,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.5294,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.3081416312905957,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.5232,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.29513374549201843,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5401,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.2773408705908564,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.4806,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.2798973656042734,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.4714,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.30574862160174193,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.4972,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.32748214795140573,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.5661,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.3252553366736222,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.5583,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.35670821112923623,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5849,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.2871837048418551,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.5328,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.31323911862742787,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.535,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.32004837470051534,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.54,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.30508189378285194,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.5241,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.28600891039631177,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.5407,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.2685474416983108,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.5263,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.3288666824268659,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.5633,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3279722764357726,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.5567,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.30724088779643627,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.526,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3075719636900623,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.5779,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3264675500929435,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.5627,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3487082024475743,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.5495,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.30440794115951253,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.5452,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.32424742144960367,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.5304,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.38630918402153647,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.5589,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.2915176550129016,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.5208,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3292020277917887,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.5611,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.32495013533889283,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.5448,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.3282110627791997,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.5604,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.28993296253098433,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.4881,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.33432515559265674,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.5635,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.31056272504625193,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5052,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.2784522543426241,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.5279,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.28274294573486125,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.5471,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.3666468314245622,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.5479,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3077175023277414,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5127,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.3251709821723278,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.583,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.30886321887992296,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5752,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.27366018678935805,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.5128,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3000169994037466,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.4773,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.3300365338724269,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.5475,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.2875891880642037,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.5044,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.3142456816575401,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.5518,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3506197385399373,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.5615,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.32899475302997194,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.5667,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3056242008956224,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.5245,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.28563965971737626,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.5152,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.2870864372747718,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.5015,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3373663391946109,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.5638,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3496232984165067,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.5595,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.28995611594609594,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.4965,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4967736974804734,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.5359,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.3453416340275592,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.5437,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3123927141691169,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.554,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.3543812860374282,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.5634,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.30432269982327737,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5288,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.3003660393916413,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.5476,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3591173722638707,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5912,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.3762179489067089,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.5701,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3156922850378513,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5274,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.31274555421611044,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.5137,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.30169419740639275,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.514,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.2782830158634876,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.489,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3304509830037468,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.555,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.296451342339957,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.5328,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.31534851649038614,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.5541,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.3084359553582928,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.5328,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.341562199601936,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.5575,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.35015201530763834,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6025,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.2993997403965686,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.5233,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.29982110762334596,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.51,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3019064768245265,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.542,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.35085498091047945,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.5788,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.29740523699437377,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.4931,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.2876930635912273,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.4932,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.31294293634047604,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.5826,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.3068928309390789,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.5346,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3100798430096716,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5196,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.32708364181786515,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.5521,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3207712088485768,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.524,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.3660178711580096,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.5655,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3143621876194328,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.5449,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.31830508951267983,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.5385,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.27040696747271814,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.4921,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.31288584178094314,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.502,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.28642818157187094,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.5245,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.3491942090381071,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.5499,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.29823801708556746,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.5234,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.30719005009070094,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.5749,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3062795801855541,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.5825,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.27635298150584786,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.5003,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3355786568989501,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.5482,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.2932756874456343,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.5011,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3269210423789285,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5726,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.29664305615872355,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.535,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3279135895836481,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5337,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.3023263703012834,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.5241,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.31453396197595923,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.5706,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.32532182414773664,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.5738,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.35771452622909367,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.5646,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.3191261928779677,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.5559,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3157418636170192,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.5253,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.36085371540844025,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.5481,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3003756422947911,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5362,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.2817775041106162,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.4897,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.28590694646675563,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.4885,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.30886813713061007,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.5272,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.28847217084933324,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.5103,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.558607761663773,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.5394,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.29232474305564354,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.5071,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.40299383279271805,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.5773,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.299439992576117,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.5126,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.30741883936624265,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.5131,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3823429238720938,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.5459,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.35231467984482157,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.5927,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.31776257432998906,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5432,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.3102367246725002,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.5321,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.2952381694304628,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5259,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.3203347190740959,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.5375,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.33453341696425476,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.5545,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2886438992098675,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.5063,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.2957922466846594,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.5138,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.32379520985731924,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.527,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.34378734225924573,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.5904,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.31208605637507303,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.5311,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.34496199350844126,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.5527,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.3110318604047679,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.5323,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.3069235571145225,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.5284,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.294701697184567,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.4935,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.34120922974979584,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5729,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.3382073855820592,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.5626,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3207381770313093,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.543,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.33639489656827687,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.5768,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3164902799063137,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.5852,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.3269986614052039,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.5584,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3401491702230297,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.5875,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.301546032734985,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.5155,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.29486548220063846,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.5409,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.28913316993391175,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.5164,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.2951028071209438,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.524,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.32268092430664863,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.5112,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.29083850843653986,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.5187,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.33878533738514793,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.5579,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3583612786949319,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6055,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.3157374969540624,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.5193,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3323887403847907,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5483,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.3271313369536848,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.5291,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.33490858014078323,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.4985,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.398077948712715,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.5612,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.31927806478811827,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5282,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.31620075198738284,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.5373,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3140137639251144,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5141,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.30376308756664244,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.533,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3050295266759193,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.4898,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.29951904118841577,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.5268,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.34540743633115883,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.567,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.30231064759206816,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.519,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.316037695232489,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.4763,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.31721374711969536,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.5215,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3333723365925436,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.5361,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.2988022996120898,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.5202,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.313165965383161,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.5052,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.3133261195918503,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.5021,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3310204849166293,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.5449,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.28642784716383,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.4939,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3231255023681699,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.5071,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.3330966662426535,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.5205,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.3013409817811988,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.5063,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.2906630284974183,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.5013,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.31316666023740375,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.5215,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.3180187491635761,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.5432,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.28744382137160857,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.503,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.30423697755815965,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.5339,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.28959749985331457,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.5177,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.30651468374097873,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.5003,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3399118692018144,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.505,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.30229537171643533,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.5135,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.2980808766300904,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.5157,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.32109007310226906,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.5441,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.31518895500804905,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5398,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.30746141207362454,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.5138,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.29844474026185125,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5127,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.31372953790829117,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.5267,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.31513266463882467,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.5441,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.3373577735158178,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.5549,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.33313158820733985,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5365,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.3244540875818248,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.514,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.33794058251637915,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5078,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.2923761625384951,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.5028,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3083872375206693,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.5173,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.37042508296412585,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.5302,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.30268944873115844,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.5077,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.3299826370842139,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.5566,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.32169935120948345,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.5135,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.3154373318031867,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.5457,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.2928616342677835,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.4758,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.3255590348687943,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.4986,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.36347369876321284,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.5508,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.31922620403351204,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.5499,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.2806988390920151,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.4693,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.31126016115643457,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.5503,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.28994395860031663,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.525,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.3154581960565633,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.5432,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3176066044180408,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.5437,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.3326477947403509,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.4988,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.45512219273453286,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.568,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.3278966468396552,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.5401,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.323752718264929,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.5268,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.31816355397904306,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.5268,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.31568343150732775,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.5798,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.2918941726574028,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.5028,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3364645819786627,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6063,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.3252749199764253,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.5074,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.32307432124495544,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.537,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.31387578239447933,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.5442,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.304637618589797,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5397,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.41468661451644506,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.4844,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.302638682703656,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.4872,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.28655926896016676,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.475,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.30358370328840534,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.5041,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.28478468702495974,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.5192,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.32519038445347415,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5586,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.2993081248315083,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.5037,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.2958894909202451,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.4983,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.3142134015167779,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.5111,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.31999967793391854,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.5191,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.31633486555340273,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.5351,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.30675019805018006,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.5149,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.3035224544365652,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.513,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3205777006087568,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.537,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.3453851337910027,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.5138,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3110439277309376,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.538,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.2900825779303158,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.5188,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3131984751420794,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.5088,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.3466175063672184,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.5194,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.32478119070806216,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5155,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.2999668283208606,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.5018,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3134414290580692,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.569,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.2953094923511177,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.4756,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.34344512875367766,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.5455,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.30558827977054226,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.5149,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.31275905263153886,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5286,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.28499175183291087,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.4876,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4808999903189011,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.538,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.31226313414768764,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.5459,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.29523397120763456,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.486,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.3520154899027078,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.5364,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.37354671545114215,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5482,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.30755601609273053,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.5028,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.2939498097443535,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.5072,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.2746461418963455,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.4635,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.34560038360849,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.5528,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.9249883599597586,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.5623,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3660112464881137,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5154,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.3238829544216669,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.5607,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.31085999125447755,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.5063,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.30117998298835985,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.5451,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.31234562961090084,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.5213,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.3358034693770683,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.5301,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3173787926469782,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.5645,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.4146473527673543,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.5677,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.2866437503923634,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.4946,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.2943197456246717,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.5112,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.30535385104586243,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.5516,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.3205778331850039,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.54,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.36322728698820905,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.5663,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.3339515307274971,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.5173,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.25378772151745665,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.4441,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.30785812626660675,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.5446,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3283730943219699,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5399,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.3035437942206902,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.5246,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.34024560448906616,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5227,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.3331008664020577,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.4932,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.2938886631788762,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.5289,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.30285455598285027,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.5553,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.32815425365861317,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.5341,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.29776320315837895,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.5015,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.29765736304272533,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5149,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.36027512949995577,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.5111,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.30288619177488824,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.5116,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.3103546766773175,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.5231,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.29366551229568166,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.4787,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.3165439339426244,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.5203,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3233906599138733,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.5466,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.31714449024996777,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.517,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3023578455209064,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.5351,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.2993120067816697,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.5218,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3121313135121532,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.5376,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.29916412187483066,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.4909,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.30620573314273947,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5024,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.3254168132789596,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.5706,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.29745520661849084,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.4772,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.30232853467614984,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.5229,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.30328558964827984,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5235,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.3091051560807406,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.5267,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3204451227014326,
+      "learning_rate": 0.0,
+      "loss": 0.5408,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1262987898191872.0,
+      "train_loss": 0.5855974884986878,
+      "train_runtime": 20106.5862,
+      "train_samples_per_second": 0.995,
+      "train_steps_per_second": 0.062
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1262987898191872.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7644c362e723a4b41037abb906be799d6d33bd2
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..651aeba4abfdb6710a1b953b7d6c469fc10534eb
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:483c779bb16880a64bc2d1efce90d92f99570391c506bc6e5ea3da406b2d2179
+size 671150064
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0bf2a2688f8a97641d0462fdc0dddb272604c27b
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da8e960957e2137b6feb2287acfa4440b3e6fd8f7293c07327837d10a9bc134b
+size 918507402
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e22847176eb6f46cc078bbbf5646872348c4f46d
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7897449163773217,
+      "learning_rate": 5e-05,
+      "loss": 1.0158,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7286029077519581,
+      "learning_rate": 0.0001,
+      "loss": 0.9785,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5007841633835526,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8868,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.551066986410025,
+      "learning_rate": 0.0002,
+      "loss": 0.869,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.7067240936501759,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 0.8414,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.44664188050548564,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 0.7815,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4193441176930537,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.804,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.41306296762126027,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.7387,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.435261450186937,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.7559,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.41019140182818303,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.7582,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.43892155428418417,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.7391,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.45186235095684296,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.7633,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.44124204480242757,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.7436,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.3664602718616742,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.7012,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.36386121096582563,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.7551,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.35587801041616296,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.7146,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.34427088443737364,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.6675,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.35523064882833827,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.7573,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.36215020830520817,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.7106,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3397367701549374,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.6803,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.41634440568087716,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.6785,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.38124851642853896,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.7187,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.38564705024675255,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.7416,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3848627816158859,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.7437,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.39332044646176956,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.7776,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3884802424868953,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.7332,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3428409307296526,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.7081,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3303519063768321,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.6989,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.30598505514605384,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.6628,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4709929717685942,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7294,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3607051665952329,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.7177,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3396445814711014,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.6752,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3843901133072089,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.7306,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3209838157061621,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.6754,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3157296469039439,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.7055,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3636809922566001,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.755,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.33887530853673087,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.6715,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3139532292851415,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.6737,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3887953669372525,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7348,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.34671023775518583,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.6969,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4055661542804412,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.72,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3281789122262739,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.6991,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.33527063756600683,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.6772,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.34628576129023203,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.6718,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.32625551196869307,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.6714,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.33248202305747837,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.6676,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.3203522050028239,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.6786,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.38431418470170736,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.6839,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.3543522461020586,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.6793,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3938030668231684,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.7094,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.3434735143890009,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.6976,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.33074261951215367,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.6408,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.325639624418813,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.6598,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.32277191745270095,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.6417,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3469996270480694,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.6502,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3222945210391576,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.6656,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.358944803881362,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.6622,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3726202507097444,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.6961,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.39039592382246363,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.6898,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3155957271966707,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.6199,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3593109426693744,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.6475,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3369009994062507,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.6385,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.34306269167543985,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.6595,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.308051310722022,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.6155,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3352389931983909,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.6081,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3543975811840973,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.6623,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3002191119803479,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.6203,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3164322589890033,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.653,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.31243198237928543,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.6454,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.32215726035020803,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.6572,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3013215539057925,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.6145,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3315404842154109,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.656,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3258222002707797,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.6498,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.31360991756541984,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.6379,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3177067712810427,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.6955,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3317732798005526,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.6479,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.28782041118815993,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.5791,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3181330408066894,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.6304,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3105640994734434,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.6146,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.31041728231206744,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.6782,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.28998618193622805,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.6205,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3015857735322906,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.6126,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.32714747306023156,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.6886,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3259955110435763,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.6559,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.35745803891368194,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.6654,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.28442451383341705,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.6287,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.31498875825303785,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.6631,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.35372019891399126,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.6475,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3331501046613709,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.6494,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.28067458269386936,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.5944,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.37713100766680624,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.6791,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.31509292608707157,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.6719,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3152963433402979,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.6616,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.31293005737263796,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.6147,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3092685519717373,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.6758,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3533787649261356,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.6105,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.34853704155291676,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.6801,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3219342935342763,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.6127,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3050310948436688,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.6367,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31232129631117883,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.6458,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3140181650190237,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.6416,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3201000886253217,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.6114,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3156734020633334,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.5782,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2933312644780593,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.6489,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.300575715048929,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.6312,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3283711441741722,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.6499,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.34622150730617574,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.6292,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3358034751151104,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.6374,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.291595756315806,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.6187,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3385592583782257,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.6704,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3211302101059581,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.635,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.32649921097581136,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.6635,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3264709794087455,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.647,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3300373343975954,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.631,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.32177072851179134,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.629,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3158053643511603,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.6011,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.2967406337957916,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.6299,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.34898701539844956,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.6639,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.2935252997159363,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.6434,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3056888738443196,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.6242,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.28229812547022365,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.5963,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3043090639132224,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.6056,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3209412637695787,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.6676,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3293937488127893,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.684,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3203293853257442,
+      "learning_rate": 0.0,
+      "loss": 0.6528,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 122223981690880.0,
+      "train_loss": 0.6799252457618713,
+      "train_runtime": 2005.2448,
+      "train_samples_per_second": 0.997,
+      "train_steps_per_second": 0.062
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 122223981690880.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b099c2177afe74fbee058fa2f26dff7569a57566
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b0a3dd656d69a633da22888c7ebd9e1470f349c2
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:651813e02a6e841f95aead8d7be4dc00f7054c6c7d118943b8eb39c0e338b841
+size 671150064
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f22ace60b438e6f004b60dcc70a9360993270bbc
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fba9a24d2cb3e8de37aa61b86e709f5084e1df61cfdeac00092e390771e9b32a
+size 918507402
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d39418ae70f790557bb5b5429f93d25b9da979d
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/trainer_state.json
@@ -0,0 +1,1792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8103855719925606,
+      "learning_rate": 2.5e-05,
+      "loss": 1.0158,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.749041337221085,
+      "learning_rate": 5e-05,
+      "loss": 0.9785,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6098257778772503,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 0.9223,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.45221154229482324,
+      "learning_rate": 0.0001,
+      "loss": 0.8843,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6036953220555253,
+      "learning_rate": 0.000125,
+      "loss": 0.8286,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.620523806065544,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.815,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4957034078191343,
+      "learning_rate": 0.000175,
+      "loss": 0.8283,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.42028357018114476,
+      "learning_rate": 0.0002,
+      "loss": 0.7462,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4560597515326352,
+      "learning_rate": 0.0001999915737775817,
+      "loss": 0.7616,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.41835878738687804,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 0.767,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4519080647788084,
+      "learning_rate": 0.00019992417251814282,
+      "loss": 0.7513,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4374793073145124,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 0.766,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4322774659638822,
+      "learning_rate": 0.0001997894154323911,
+      "loss": 0.7468,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.38086545856215315,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.706,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.39822175757488365,
+      "learning_rate": 0.0001995873933559535,
+      "loss": 0.7625,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3725703988716922,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.7136,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.3478374330159542,
+      "learning_rate": 0.0001993182424657285,
+      "loss": 0.6671,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.36446041603835655,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.7601,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.36694244434340323,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 0.7126,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.34083750152431197,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.6833,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.36394234532749736,
+      "learning_rate": 0.0001985793250766098,
+      "loss": 0.6821,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3709212631288505,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.7241,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.36975396353372153,
+      "learning_rate": 0.00019811005665931205,
+      "loss": 0.7421,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3740507599475808,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.7452,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4306403705259776,
+      "learning_rate": 0.0001975746552556772,
+      "loss": 0.7769,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.362436854577378,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.7345,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.34354980264780216,
+      "learning_rate": 0.0001969734817634044,
+      "loss": 0.7058,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.34800648439170206,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.6982,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3053541603133724,
+      "learning_rate": 0.00019630694141514464,
+      "loss": 0.663,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3575768848594637,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.7314,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.36309312304789304,
+      "learning_rate": 0.0001955754835053459,
+      "loss": 0.7163,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3265601653767092,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.6771,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3856001462608568,
+      "learning_rate": 0.0001947796010873974,
+      "loss": 0.7349,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3291353426839004,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.6753,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.32706839904535945,
+      "learning_rate": 0.0001939198306412775,
+      "loss": 0.7078,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.35810709497865256,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.7593,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.35027802088238474,
+      "learning_rate": 0.0001929967517119289,
+      "loss": 0.6751,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3319865050168186,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.6748,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3531139938876366,
+      "learning_rate": 0.0001920109865186052,
+      "loss": 0.7372,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.35271327408605185,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.6992,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.41785713317021467,
+      "learning_rate": 0.00019096319953545185,
+      "loss": 0.7252,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.328699243494675,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7003,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3381449706301949,
+      "learning_rate": 0.00018985409704360456,
+      "loss": 0.6796,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.37297787530769627,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.6778,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.379159875115435,
+      "learning_rate": 0.00018868442665510678,
+      "loss": 0.6713,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3377589122295175,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.6695,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.31996790180194656,
+      "learning_rate": 0.00018745497680896722,
+      "loss": 0.6832,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3540049565362026,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.6877,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.33989354564300944,
+      "learning_rate": 0.0001861665762396974,
+      "loss": 0.6877,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3824615978075444,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.7122,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.37033212364615214,
+      "learning_rate": 0.00018482009341868697,
+      "loss": 0.7023,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3542820245473082,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.6467,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3389377943078783,
+      "learning_rate": 0.00018341643596879367,
+      "loss": 0.6637,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3417210956425829,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.6436,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3381593766262924,
+      "learning_rate": 0.00018195655005254273,
+      "loss": 0.6526,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.33860874870666346,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.6693,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3400023167966099,
+      "learning_rate": 0.00018044141973434758,
+      "loss": 0.6654,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.37388798138190976,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.7037,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3502639926183974,
+      "learning_rate": 0.00017887206631718203,
+      "loss": 0.6953,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3292362366547009,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.6236,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.36883404655464086,
+      "learning_rate": 0.00017724954765415137,
+      "loss": 0.6536,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3510875420258689,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.6439,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3634644898935124,
+      "learning_rate": 0.00017557495743542585,
+      "loss": 0.6664,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.31941526418984295,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.6234,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3347914844533306,
+      "learning_rate": 0.00017384942445101772,
+      "loss": 0.6135,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.343437021079617,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.6676,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.31238707340906624,
+      "learning_rate": 0.00017207411182989832,
+      "loss": 0.6299,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3231974475690498,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.6653,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.31339344001858105,
+      "learning_rate": 0.00017025021625596853,
+      "loss": 0.6518,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.32723978959602745,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.6671,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3091424191482883,
+      "learning_rate": 0.0001683789671614107,
+      "loss": 0.619,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3437405961553586,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.6604,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3379880167585152,
+      "learning_rate": 0.00016646162589796615,
+      "loss": 0.6602,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.33743544509510187,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.649,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.34455164058166915,
+      "learning_rate": 0.00016449948488669639,
+      "loss": 0.7012,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.325758750535915,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.6624,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.29045691337733437,
+      "learning_rate": 0.00016249386674680184,
+      "loss": 0.5921,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.32785744667852906,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.6415,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.29899909920728357,
+      "learning_rate": 0.00016044612340408466,
+      "loss": 0.6226,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.32325512818097923,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.6867,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.31602836762331643,
+      "learning_rate": 0.00015835763517965673,
+      "loss": 0.6342,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.34250938844506945,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.626,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3424939486401721,
+      "learning_rate": 0.0001562298098595078,
+      "loss": 0.6957,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.334417096892308,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.6675,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.46076813754051577,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 0.6749,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.29210933308142606,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.6409,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3528923975825312,
+      "learning_rate": 0.00015186191068884775,
+      "loss": 0.6765,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.324799963637437,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.6609,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3088789930704027,
+      "learning_rate": 0.00014962478110547918,
+      "loss": 0.66,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2854688168520043,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.6035,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.42689653734470706,
+      "learning_rate": 0.0001473542009760343,
+      "loss": 0.6938,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.35651456548401894,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.6838,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.33635785287045716,
+      "learning_rate": 0.0001450517008290827,
+      "loss": 0.6671,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.336852572915818,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.6317,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.33576829692825366,
+      "learning_rate": 0.00014271883270950073,
+      "loss": 0.6805,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3329231732565777,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.6249,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3598961646096611,
+      "learning_rate": 0.00014035716913228568,
+      "loss": 0.6907,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3115561422500585,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.6239,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3459018767378786,
+      "learning_rate": 0.0001379683020225714,
+      "loss": 0.6466,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.32457665510352385,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.6533,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.33508111254965955,
+      "learning_rate": 0.00013555384164256048,
+      "loss": 0.6509,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3269052404706792,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.6048,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.30770998047308523,
+      "learning_rate": 0.00013311541550609565,
+      "loss": 0.5852,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.31424672359407035,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.6484,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.43876915582562553,
+      "learning_rate": 0.00013065466728160252,
+      "loss": 0.6495,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.34563580586225817,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.6577,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3572647064435535,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 0.6246,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3522582386523308,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.635,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.2967086734627664,
+      "learning_rate": 0.00012567285335732633,
+      "loss": 0.6269,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.34166098841794695,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.6769,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.35539661438628445,
+      "learning_rate": 0.00012315514574583113,
+      "loss": 0.6341,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.33161189448182593,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.6723,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.338951247552023,
+      "learning_rate": 0.00012062182995929882,
+      "loss": 0.6582,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3116039910821697,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.6347,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3222821987139117,
+      "learning_rate": 0.0001180746136283638,
+      "loss": 0.6295,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3180175846318794,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.6105,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.30937980648503693,
+      "learning_rate": 0.00011551521375359206,
+      "loss": 0.6388,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3642307008353836,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.6686,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.322639383880906,
+      "learning_rate": 0.00011294535554810354,
+      "loss": 0.6501,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3162171243639988,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.6274,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.28758115580861665,
+      "learning_rate": 0.00011036677127465889,
+      "loss": 0.6027,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3073842474256611,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.6139,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.32310515303670384,
+      "learning_rate": 0.00010778119907799398,
+      "loss": 0.6554,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.32596935361275065,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.6884,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3312656057889271,
+      "learning_rate": 0.00010519038181318999,
+      "loss": 0.6653,
+      "step": 125
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.30999248898809556,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.5014,
+      "step": 126
+    },
+    {
+      "epoch": 1.016,
+      "grad_norm": 0.2763160568966665,
+      "learning_rate": 0.00010259606587086783,
+      "loss": 0.4397,
+      "step": 127
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.27788323256482617,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.4393,
+      "step": 128
+    },
+    {
+      "epoch": 1.032,
+      "grad_norm": 0.3150059690847303,
+      "learning_rate": 0.0001,
+      "loss": 0.4681,
+      "step": 129
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.31481102781455156,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.4353,
+      "step": 130
+    },
+    {
+      "epoch": 1.048,
+      "grad_norm": 0.4709701646556055,
+      "learning_rate": 9.740393412913219e-05,
+      "loss": 0.4882,
+      "step": 131
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.4116226720028693,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.4502,
+      "step": 132
+    },
+    {
+      "epoch": 1.064,
+      "grad_norm": 0.3862376767849978,
+      "learning_rate": 9.480961818681004e-05,
+      "loss": 0.4818,
+      "step": 133
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.3930610489851894,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.4577,
+      "step": 134
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.3576815014862209,
+      "learning_rate": 9.221880092200601e-05,
+      "loss": 0.4423,
+      "step": 135
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.3513091515896804,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.4169,
+      "step": 136
+    },
+    {
+      "epoch": 1.096,
+      "grad_norm": 0.351105537272752,
+      "learning_rate": 8.963322872534114e-05,
+      "loss": 0.4212,
+      "step": 137
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.36779752955190265,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.4522,
+      "step": 138
+    },
+    {
+      "epoch": 1.112,
+      "grad_norm": 0.3904869721114291,
+      "learning_rate": 8.705464445189647e-05,
+      "loss": 0.491,
+      "step": 139
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.330258132098933,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.4438,
+      "step": 140
+    },
+    {
+      "epoch": 1.1280000000000001,
+      "grad_norm": 0.37964928396907405,
+      "learning_rate": 8.448478624640797e-05,
+      "loss": 0.4462,
+      "step": 141
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.3397159120181284,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.4206,
+      "step": 142
+    },
+    {
+      "epoch": 1.144,
+      "grad_norm": 0.34881844600691564,
+      "learning_rate": 8.192538637163621e-05,
+      "loss": 0.4341,
+      "step": 143
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.36104875179096,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.4276,
+      "step": 144
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.3300081995494602,
+      "learning_rate": 7.93781700407012e-05,
+      "loss": 0.4197,
+      "step": 145
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.3773761135278097,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.4228,
+      "step": 146
+    },
+    {
+      "epoch": 1.176,
+      "grad_norm": 0.3677726783865881,
+      "learning_rate": 7.684485425416888e-05,
+      "loss": 0.4308,
+      "step": 147
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.4039348666723891,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.4643,
+      "step": 148
+    },
+    {
+      "epoch": 1.192,
+      "grad_norm": 0.3977290252315367,
+      "learning_rate": 7.432714664267373e-05,
+      "loss": 0.4652,
+      "step": 149
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.39042245840535644,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.4417,
+      "step": 150
+    },
+    {
+      "epoch": 1.208,
+      "grad_norm": 0.3546136856499018,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 0.4112,
+      "step": 151
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.35726383327350786,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.417,
+      "step": 152
+    },
+    {
+      "epoch": 1.224,
+      "grad_norm": 0.35997235150118934,
+      "learning_rate": 6.934533271839752e-05,
+      "loss": 0.4269,
+      "step": 153
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.38548264483214323,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.467,
+      "step": 154
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.3663689248918521,
+      "learning_rate": 6.688458449390437e-05,
+      "loss": 0.4469,
+      "step": 155
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.34663568156293084,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.4111,
+      "step": 156
+    },
+    {
+      "epoch": 1.256,
+      "grad_norm": 0.36455028785765153,
+      "learning_rate": 6.444615835743955e-05,
+      "loss": 0.4388,
+      "step": 157
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.35674055617897765,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.4301,
+      "step": 158
+    },
+    {
+      "epoch": 1.272,
+      "grad_norm": 0.3158129223428509,
+      "learning_rate": 6.203169797742861e-05,
+      "loss": 0.3943,
+      "step": 159
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.37730456344078195,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.4303,
+      "step": 160
+    },
+    {
+      "epoch": 1.288,
+      "grad_norm": 0.3600002465418534,
+      "learning_rate": 5.964283086771435e-05,
+      "loss": 0.4036,
+      "step": 161
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.411983597945873,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.4323,
+      "step": 162
+    },
+    {
+      "epoch": 1.304,
+      "grad_norm": 0.5646047576197016,
+      "learning_rate": 5.728116729049928e-05,
+      "loss": 0.41,
+      "step": 163
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.3799713515892477,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.435,
+      "step": 164
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.3701436657920335,
+      "learning_rate": 5.4948299170917325e-05,
+      "loss": 0.4382,
+      "step": 165
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.43034822774042925,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.4385,
+      "step": 166
+    },
+    {
+      "epoch": 1.336,
+      "grad_norm": 0.3396215340957992,
+      "learning_rate": 5.26457990239657e-05,
+      "loss": 0.4132,
+      "step": 167
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.3372682640636405,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.4014,
+      "step": 168
+    },
+    {
+      "epoch": 1.3519999999999999,
+      "grad_norm": 0.3420013343555165,
+      "learning_rate": 5.0375218894520834e-05,
+      "loss": 0.4308,
+      "step": 169
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.36313664255414346,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.4241,
+      "step": 170
+    },
+    {
+      "epoch": 1.3679999999999999,
+      "grad_norm": 0.38860897284885565,
+      "learning_rate": 4.813808931115228e-05,
+      "loss": 0.4001,
+      "step": 171
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.3996408209451103,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.45,
+      "step": 172
+    },
+    {
+      "epoch": 1.384,
+      "grad_norm": 0.35352369486178004,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.4104,
+      "step": 173
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.34130167364663144,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.3857,
+      "step": 174
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.39244887671048206,
+      "learning_rate": 4.377019014049223e-05,
+      "loss": 0.4486,
+      "step": 175
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.4167715151074163,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.4404,
+      "step": 176
+    },
+    {
+      "epoch": 1.416,
+      "grad_norm": 0.3735006311923496,
+      "learning_rate": 4.164236482034327e-05,
+      "loss": 0.4534,
+      "step": 177
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.38617905049661294,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.3994,
+      "step": 178
+    },
+    {
+      "epoch": 1.432,
+      "grad_norm": 0.3261675968534702,
+      "learning_rate": 3.9553876595915375e-05,
+      "loss": 0.4156,
+      "step": 179
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.35742950700768167,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.4105,
+      "step": 180
+    },
+    {
+      "epoch": 1.448,
+      "grad_norm": 0.36928757386227556,
+      "learning_rate": 3.750613325319817e-05,
+      "loss": 0.4323,
+      "step": 181
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.3868031899079498,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.4234,
+      "step": 182
+    },
+    {
+      "epoch": 1.464,
+      "grad_norm": 0.32848087629552325,
+      "learning_rate": 3.550051511330361e-05,
+      "loss": 0.3961,
+      "step": 183
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.3427179300809076,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.4109,
+      "step": 184
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.3668791005619767,
+      "learning_rate": 3.3538374102033866e-05,
+      "loss": 0.4141,
+      "step": 185
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.39299921470783666,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.4462,
+      "step": 186
+    },
+    {
+      "epoch": 1.496,
+      "grad_norm": 0.3869815292768112,
+      "learning_rate": 3.1621032838589305e-05,
+      "loss": 0.4061,
+      "step": 187
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.4097815530872006,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.4704,
+      "step": 188
+    },
+    {
+      "epoch": 1.512,
+      "grad_norm": 0.37871448435448984,
+      "learning_rate": 2.974978374403147e-05,
+      "loss": 0.429,
+      "step": 189
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.3511999133515402,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.4109,
+      "step": 190
+    },
+    {
+      "epoch": 1.528,
+      "grad_norm": 0.3731155678723334,
+      "learning_rate": 2.7925888170101665e-05,
+      "loss": 0.4061,
+      "step": 191
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 0.3331471399925095,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.406,
+      "step": 192
+    },
+    {
+      "epoch": 1.544,
+      "grad_norm": 0.36824739532374523,
+      "learning_rate": 2.6150575548982292e-05,
+      "loss": 0.4413,
+      "step": 193
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 0.37028915853531374,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.4289,
+      "step": 194
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.3377837537975486,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.3847,
+      "step": 195
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 0.35978467243099815,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.4319,
+      "step": 196
+    },
+    {
+      "epoch": 1.576,
+      "grad_norm": 0.35418080181674005,
+      "learning_rate": 2.2750452345848682e-05,
+      "loss": 0.4199,
+      "step": 197
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 0.37268287174991027,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.4087,
+      "step": 198
+    },
+    {
+      "epoch": 1.592,
+      "grad_norm": 0.3409473333562335,
+      "learning_rate": 2.112793368281799e-05,
+      "loss": 0.4342,
+      "step": 199
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.355860200634431,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.37,
+      "step": 200
+    },
+    {
+      "epoch": 1.608,
+      "grad_norm": 0.35012224620268156,
+      "learning_rate": 1.9558580265652448e-05,
+      "loss": 0.394,
+      "step": 201
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 0.3695753625239139,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.4148,
+      "step": 202
+    },
+    {
+      "epoch": 1.624,
+      "grad_norm": 0.4258711912122203,
+      "learning_rate": 1.804344994745727e-05,
+      "loss": 0.4375,
+      "step": 203
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 0.3803097291019688,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.4166,
+      "step": 204
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.37465233653565705,
+      "learning_rate": 1.6583564031206357e-05,
+      "loss": 0.4016,
+      "step": 205
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 0.3672479996101215,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.406,
+      "step": 206
+    },
+    {
+      "epoch": 1.6560000000000001,
+      "grad_norm": 0.3774177953897066,
+      "learning_rate": 1.5179906581313064e-05,
+      "loss": 0.4288,
+      "step": 207
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 0.3710652365358924,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.4011,
+      "step": 208
+    },
+    {
+      "epoch": 1.6720000000000002,
+      "grad_norm": 0.3471124217622006,
+      "learning_rate": 1.3833423760302611e-05,
+      "loss": 0.3787,
+      "step": 209
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.37562050193480484,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.456,
+      "step": 210
+    },
+    {
+      "epoch": 1.688,
+      "grad_norm": 0.39588222129140765,
+      "learning_rate": 1.2545023191032801e-05,
+      "loss": 0.3887,
+      "step": 211
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 0.3602840989644041,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.3935,
+      "step": 212
+    },
+    {
+      "epoch": 1.704,
+      "grad_norm": 0.34855973676200813,
+      "learning_rate": 1.131557334489326e-05,
+      "loss": 0.3749,
+      "step": 213
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 0.3479812256611428,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.3951,
+      "step": 214
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.3818486969634882,
+      "learning_rate": 1.0145902956395447e-05,
+      "loss": 0.4067,
+      "step": 215
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 0.35208306834551606,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.3844,
+      "step": 216
+    },
+    {
+      "epoch": 1.736,
+      "grad_norm": 0.3877100998448152,
+      "learning_rate": 9.036800464548157e-06,
+      "loss": 0.4431,
+      "step": 217
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 0.38366231473311935,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.3757,
+      "step": 218
+    },
+    {
+      "epoch": 1.752,
+      "grad_norm": 0.38165201326373444,
+      "learning_rate": 7.989013481394814e-06,
+      "loss": 0.4083,
+      "step": 219
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.36942326110852475,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.4043,
+      "step": 220
+    },
+    {
+      "epoch": 1.768,
+      "grad_norm": 0.3547002213884238,
+      "learning_rate": 7.003248288071118e-06,
+      "loss": 0.3822,
+      "step": 221
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 0.3336558366840081,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.3822,
+      "step": 222
+    },
+    {
+      "epoch": 1.784,
+      "grad_norm": 0.37406602326195987,
+      "learning_rate": 6.08016935872251e-06,
+      "loss": 0.4182,
+      "step": 223
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 0.3713633267545657,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.4094,
+      "step": 224
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.38273162844988756,
+      "learning_rate": 5.22039891260262e-06,
+      "loss": 0.4072,
+      "step": 225
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 0.3509985921798932,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.3954,
+      "step": 226
+    },
+    {
+      "epoch": 1.8159999999999998,
+      "grad_norm": 0.37305289489211013,
+      "learning_rate": 4.424516494654118e-06,
+      "loss": 0.4254,
+      "step": 227
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 0.3965276510176484,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.4322,
+      "step": 228
+    },
+    {
+      "epoch": 1.8319999999999999,
+      "grad_norm": 0.3685211551685983,
+      "learning_rate": 3.693058584855369e-06,
+      "loss": 0.4307,
+      "step": 229
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.3686387036160562,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.4582,
+      "step": 230
+    },
+    {
+      "epoch": 1.8479999999999999,
+      "grad_norm": 0.40006221636184197,
+      "learning_rate": 3.026518236595621e-06,
+      "loss": 0.4319,
+      "step": 231
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 0.3686101936012674,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.4401,
+      "step": 232
+    },
+    {
+      "epoch": 1.8639999999999999,
+      "grad_norm": 0.3639816237696081,
+      "learning_rate": 2.4253447443228106e-06,
+      "loss": 0.4228,
+      "step": 233
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 0.3914836318682469,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.4232,
+      "step": 234
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.3475812805328899,
+      "learning_rate": 1.8899433406879608e-06,
+      "loss": 0.4048,
+      "step": 235
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 0.3819179857607728,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.4349,
+      "step": 236
+    },
+    {
+      "epoch": 1.896,
+      "grad_norm": 0.34980986059743474,
+      "learning_rate": 1.4206749233902084e-06,
+      "loss": 0.3982,
+      "step": 237
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 0.3618752000752406,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.3869,
+      "step": 238
+    },
+    {
+      "epoch": 1.912,
+      "grad_norm": 0.3617386662356333,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 0.4229,
+      "step": 239
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.3440710649386622,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.3715,
+      "step": 240
+    },
+    {
+      "epoch": 1.928,
+      "grad_norm": 0.41173420560281065,
+      "learning_rate": 6.817575342714988e-07,
+      "loss": 0.4432,
+      "step": 241
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 0.3707711016435056,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.3993,
+      "step": 242
+    },
+    {
+      "epoch": 1.944,
+      "grad_norm": 0.41577428666946314,
+      "learning_rate": 4.126066440464982e-07,
+      "loss": 0.3685,
+      "step": 243
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 0.3814321087081568,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.4361,
+      "step": 244
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.3721575866482138,
+      "learning_rate": 2.1058456760891798e-07,
+      "loss": 0.4032,
+      "step": 245
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 0.41596386646666555,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.4572,
+      "step": 246
+    },
+    {
+      "epoch": 1.976,
+      "grad_norm": 0.40832138728799905,
+      "learning_rate": 7.582748185719358e-08,
+      "loss": 0.417,
+      "step": 247
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 0.3473444140339669,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.3723,
+      "step": 248
+    },
+    {
+      "epoch": 1.992,
+      "grad_norm": 0.3546075160485246,
+      "learning_rate": 8.426222418311814e-09,
+      "loss": 0.3919,
+      "step": 249
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.3512853828343401,
+      "learning_rate": 0.0,
+      "loss": 0.3783,
+      "step": 250
+    },
+    {
+      "epoch": 2.0,
+      "step": 250,
+      "total_flos": 246886161580032.0,
+      "train_loss": 0.5540215849876404,
+      "train_runtime": 3986.2101,
+      "train_samples_per_second": 1.003,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 246886161580032.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8027b4d955825d8c902a7977d7bb207d2c735951
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e959511bb990b4f2a368e96c598aabae0ba51f9f
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ee94bcd70cdfc6d418cd4ede64ab52c3f5744b251a02d5bba5942a3744d7807
+size 671150064
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8815adc96801343827d2a39db8ac806ca8fe7324
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b74c7e1c3ceb8e3b785c8f79cfd5cea3179108ee16ae6e7b5f134c9efe2e39a
+size 918507402
diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8fa1f947e2214f33195d12572f14edef07c45f3f
--- /dev/null
+++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7457730890858434,
+      "learning_rate": 2e-05,
+      "loss": 0.9392,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7630371133565212,
+      "learning_rate": 4e-05,
+      "loss": 0.9595,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.671741960683176,
+      "learning_rate": 6e-05,
+      "loss": 0.9466,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.45841791491457373,
+      "learning_rate": 8e-05,
+      "loss": 0.8459,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5244239392744238,
+      "learning_rate": 0.0001,
+      "loss": 0.8724,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6516328157503206,
+      "learning_rate": 0.00012,
+      "loss": 0.819,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.49530456242105575,
+      "learning_rate": 0.00014,
+      "loss": 0.824,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4596397773308755,
+      "learning_rate": 0.00016,
+      "loss": 0.7665,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.44608821672777943,
+      "learning_rate": 0.00018,
+      "loss": 0.796,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.40026645073169514,
+      "learning_rate": 0.0002,
+      "loss": 0.7114,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42152892282804516,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.7637,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6645956201488676,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.7394,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4559407361234458,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.7769,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4572611529813379,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.7066,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4500067808913182,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.7371,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4197230755489818,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.7665,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.36391045871614297,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.7045,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.3872776544353665,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.7429,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.40240763245757283,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.7356,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3589800680715318,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.7669,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.3529214687594287,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.7162,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3731504462501941,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.7032,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.357510448570862,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.6901,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4240364640458765,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.704,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3439785502742619,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.6762,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3313605555697624,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.668,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.35837445292114534,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.6628,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.38609941012926274,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.7309,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.35471157176289014,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.7492,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.34328303627248624,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.7155,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.36673916279013236,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.7241,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3234177882633011,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.6861,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.32299347366268083,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.6615,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3635950424319359,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.7291,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.313827921433598,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.6951,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3968710602244457,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.6907,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3433259010957558,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.7033,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5783899904182404,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.6653,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3676724843428395,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7422,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3243352625036398,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.6482,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.34547858090045774,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.6816,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.32765881962059995,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.6874,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.38403027942722356,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7531,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.36817048699658556,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.7186,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3423144655043922,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.6902,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.32637477572324414,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.6774,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.36502486947017465,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.6889,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.40991300125547164,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.6986,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.35479733232421357,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.6632,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3672867016153713,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.6963,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3002688404746075,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.6864,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3324674434885357,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.6438,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3601713479724341,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.6184,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3249708764798294,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.6917,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3452207394377197,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.6914,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3402007827003844,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.69,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.39866643674994495,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.6917,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.33719844735024124,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.6461,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4103903760834288,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7571,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3247408010466196,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7127,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3019357718139876,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.63,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.32110595479702053,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.6407,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3459032253481727,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7009,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3643552745116129,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.6431,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3287998055114684,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.7143,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3286988331002986,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.6417,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.32519808551208057,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.6687,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.37437821580599384,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.698,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3135688689683587,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.6652,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3312906601598693,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.6685,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3039247867103697,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.6245,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3086328804027952,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.6405,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3610255480279461,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.6895,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.34434512108300025,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.6932,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3810006950863619,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7155,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3722738863269969,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.6964,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.34270037865742503,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.686,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.30772890236835826,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.6469,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.31029540900645003,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.6407,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.32844082133524866,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.6526,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.33040883292034673,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.6682,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.37490784330706134,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.6697,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3062844132482711,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.6464,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.35671527882571014,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.6666,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.35176591059029183,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.6331,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3379988728130438,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.6719,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.36243107510700484,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.6481,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3203238702127848,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.6424,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3368827289630954,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.6767,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3188974603799366,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.6543,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.33675315618562107,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.651,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.33898214421058065,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.6441,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.2983027225473012,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.6124,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.302608615368401,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.6199,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.28961037507312426,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.6032,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.31889345457891505,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.6335,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3349103755034773,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.6384,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3116776065640942,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.6404,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3340743011867128,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.6499,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.35774989469652474,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.6728,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.29207981025345064,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.6294,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.34093300396933807,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.6515,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4550580716289749,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.6658,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3549625285796341,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.6705,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.41015881382795516,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.65,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.33220099050742774,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.6247,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.33820658657375424,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.6728,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.34870862445670225,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.6613,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.2994763222445106,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.6296,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3258903531906652,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.6723,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3216665038573486,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.6383,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3707207885410005,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.67,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.31713373063325917,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.6414,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3279430359127387,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.6174,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3328376067264693,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.6611,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.30883182781096974,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.6343,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3414957274784356,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.6418,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.34237503688138976,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.6705,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3563435268997631,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.67,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.29555256017828213,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.6188,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3894091065376875,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.5784,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.30171407654719007,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.658,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3434709265832977,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.6915,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.32041483095191065,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.6158,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.33393110124438835,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.6575,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3387835672635283,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.6555,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.33957636048101425,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.6804,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.34764023958223117,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.6463,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.30499431339882926,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.6012,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.40182928175919536,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.6684,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3582372625393394,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.6767,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.354828481051296,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.6673,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3205163623498333,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.6508,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.32106540344304196,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.6447,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.32479318644166927,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.6456,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3501963258782712,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.6437,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3097916234964394,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.6224,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.33887867132062244,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.6574,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.28865597440102597,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.5969,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.29899423235735395,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.594,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.30879258648168484,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.6337,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3388183296668431,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.673,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3537905016162661,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.6195,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.33907350883788884,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.6509,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.33481880505361394,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.6373,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3485772842962881,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.6452,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3093219837930065,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.6055,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.33305496212146257,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.5984,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.33472414511634047,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.6757,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.32435244822651405,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.6251,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.31796653746835785,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.5902,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.33268287537089214,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.6279,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3271900702908842,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.6561,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.33126008778280047,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.6536,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.34781823010454777,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.6579,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3073647432254352,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.616,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.31755966518040124,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.6735,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3381564369674285,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6511,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.2972906789467651,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.626,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.32531081197126965,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.6333,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.29158833917517063,
+      "learning_rate": 0.0001,
+      "loss": 0.6049,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.30238077259284174,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.6035,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.31722530427474355,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.6041,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.30151547809589097,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.5839,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.34665183430334884,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.5952,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.30772019611290397,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.6552,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.28516168246199697,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.586,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3091269679334812,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.6509,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.31298172317771933,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.6399,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3125082847550607,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.5864,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.33615452322346095,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.6538,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.280194545648247,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.6142,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.30664071129075987,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.6081,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3302810403450827,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.5998,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.33210002602685823,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.6207,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.30444517344086625,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.6556,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3039785181090473,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.6177,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3066941689270814,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.603,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3528154065914179,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.6599,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4495795663737578,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.6187,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3184430356828446,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.6517,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.31099668809147824,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.6026,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.2833253101198598,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.6062,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.31858986421365953,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.6014,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3186441028771271,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.6247,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3152306416432087,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.5964,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3123285565579661,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.6082,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3142125866867691,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.6241,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.30715777727174004,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.6207,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.31222220068605205,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6354,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.29608382663769417,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.622,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3099543613136313,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6128,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3193556644485293,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.6693,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.34238637106239805,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.6133,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.2660721685995888,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.5928,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.312403615349147,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.645,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.32919867469057845,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.6037,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.29608072859945217,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.6108,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.29644956796385985,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.598,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3287260629333863,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6136,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.29751135110841737,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.6009,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3025139827613611,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.5997,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.28291034044323077,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.552,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.29253931579179776,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.5657,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.30854967094455543,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.6203,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.2928674567367288,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.5918,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3022165498099548,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.5768,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3373168884415293,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.5954,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.31858218039290526,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.6315,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.31784021073230706,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.5988,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.31569488085577824,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.6117,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3391114574160496,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.6308,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3099686225231022,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.5874,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3272152195311452,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.6295,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3006309655603955,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.6095,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2917718896861704,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.5584,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3256058776538367,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6446,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.30540029018850307,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.5903,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.2768986114277422,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.5508,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.319691457038105,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.5967,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3245235055980944,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6145,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2971007844582699,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.6083,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3132403382765072,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6197,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3029269929031576,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.5976,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2943639707173013,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.5643,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3259079582732509,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.6379,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3912281847418963,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6369,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3392790073664649,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6046,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3093047016302224,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6305,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.31740563814051154,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.5927,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.35757022744841516,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.6589,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.2771972396943578,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.556,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3112545964276882,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.5963,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.28881758258048246,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.5699,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.29705034758563464,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.6018,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.30799071774314674,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.61,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.327747513366901,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6307,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.2896705972194219,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.5946,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3010433149593574,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.61,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3215726833693975,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.5684,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.28698128890577407,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.5704,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2874906284664439,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.5546,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3274733948655497,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6283,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3407257862140524,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6226,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.30940111584956204,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.6182,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.318109829131004,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6321,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.29914292634306344,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.599,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2797871835987113,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.5546,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.30229086592693627,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6192,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2954873259014947,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.5913,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.2950203262186019,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.561,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.28779718073226096,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.5939,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.314887711005164,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.5933,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.2986903753941433,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.5932,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2826250372389659,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.5509,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.33381206504142535,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6154,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.30596472002594716,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.594,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.35185807954331105,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.5958,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.28230006882220116,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.5486,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3130668066124635,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.6108,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3294167678586574,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6684,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.32078244804037015,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.6503,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.47405184916696186,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.5995,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3070564349588061,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.5683,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.2946455471506841,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6075,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.31174149044192556,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.6102,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.2894262410662935,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.5679,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3072867937660247,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.5944,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3141327748974818,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.6146,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3297375563349942,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6093,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3025268519097175,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.5715,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.2763400100984386,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.5531,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3057061440220331,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.565,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.30568364567525963,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.5713,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3331966345949505,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.6677,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.2908370654322784,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.5812,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.26727331401737486,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.5618,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.312133529139538,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6026,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.2922244076597637,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.5905,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.30187540342600905,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6277,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.34788501107378356,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.6326,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.27072389174734296,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.5552,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.2931131966977575,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.5835,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3082429395866023,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.5704,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.30589246711582313,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.6061,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.28004706080113423,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.5771,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3048928905270255,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.5453,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2903529261392363,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.5858,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.29670524308182067,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.5713,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3038955745133722,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.6174,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3201029301410521,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.5997,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.30209605384437654,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.5846,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.2843990942016293,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.5617,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3117408742395904,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6211,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.29773734562986487,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.5651,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.281978621308173,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.5525,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.32845611494771243,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.5862,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3506177862675044,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.585,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3297169416792786,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.63,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.28035403388693836,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.5548,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3001871927203313,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6112,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.2795530026041373,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.5666,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3139555068178617,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.5986,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.31776585164466525,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.5706,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.2936353664174258,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.5616,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3198153539489569,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.6109,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3199566602187319,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.6099,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.31573660141129345,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6208,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.30735316287976444,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6242,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.33578478937957057,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.59,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.32324798409195554,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6159,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.31751165611922544,
+      "learning_rate": 0.0,
+      "loss": 0.592,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 305815172284416.0,
+      "train_loss": 0.6410520928792465,
+      "train_runtime": 4972.4596,
+      "train_samples_per_second": 1.006,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 305815172284416.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..571f183351ceab6dc287cddd61c1ec2dd10b069b
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4c7dd911b2b71f3a127404b0be8cbb3561fe15c1
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70e6a23753094a97b50ff3abbd77362a6b1f9b843a8a5fc00991c7d5ad8a9f22
+size 671150064
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d55c3fbf822ac17eba2e044eb8bf18d5af6ebae
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fba640a818e1406f5718e38d454b65bc39aa09bf28dcfbaaf649cd9805a6a21
+size 918507402
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f9883d89f621211dd0c11158f5f59624fd1a88d
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 4.079327622118584,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4765,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 4.40113376763699,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4812,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 2.852285481000278,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.4284,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 2.4225314639300977,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.3549,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.9486859923452315,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1165,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.9208121876027364,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0992,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.732766800590471,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9592,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.5637155696936844,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8463,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 2.0586132784503013,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9338,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.9050656462455207,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9428,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 1.5521805714320187,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8948,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.3914837471605024,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8751,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 1.3037317191929243,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8701,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.1895932626241843,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9284,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.1682465876717918,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.7463,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.3458482300267895,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8145,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 1.3343115842350872,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.78,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.345994899831727,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8096,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 1.5179124182931218,
+      "learning_rate": 0.0002,
+      "loss": 0.9103,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.453884829604924,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8481,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 1.3244483538701937,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9012,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.4273501446403416,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8483,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 1.351681032600888,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8476,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.2002303746648013,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.7745,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.224676997564932,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8161,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.2521722157868296,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8237,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 1.3839045461488768,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.796,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.4393329250022675,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9021,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 1.1708534336814247,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8184,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.3971888030003934,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.83,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 1.3950041409572966,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.791,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.3172061836573148,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7919,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 1.1674279258163187,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.7204,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 1.216402015685836,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.839,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 1.1738455727715729,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.9203,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.3324423007220763,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8302,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 1.1692674041237634,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.787,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.129891896166518,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8299,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 1.4645497511737668,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9184,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.1889881200623615,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8212,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 1.3882019110988386,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8463,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.2617605007479555,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7893,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 1.3199721270879063,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.771,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 1.3256258709641475,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7393,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.1326132922389427,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7181,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 1.1341954775455474,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8069,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 1.109294934202468,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7307,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 1.3263077512939443,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.733,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 1.0214273654864277,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7177,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.2125792061110614,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8271,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 1.2990697925289099,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.819,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 1.1626490409509023,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7431,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 1.3252770453069025,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.9627,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 1.2275945378216162,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7013,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.1206286767819427,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7889,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 1.2147862865201278,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.879,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 1.0970231076045274,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7385,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.3233091017326513,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8113,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 1.1792782814591045,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.832,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.0095028078066783,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.6964,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 1.6843718249061146,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7526,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 1.1186319562327094,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.734,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 1.1719208207564302,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.868,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 1.0609711989245456,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7287,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.9692517342409362,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.6865,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 1.153771253083348,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8092,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 1.2623889816531393,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.6917,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 1.1208482751559274,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8146,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 1.2217174035955245,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7433,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.2026904931491873,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.9273,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 1.112998793185748,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7562,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 1.0998903851247233,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7692,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 1.0972310916444612,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.685,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 1.1538532053917443,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8985,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.2150127231448657,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.788,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 1.2428997345801873,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8087,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 1.180723999696628,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7937,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 1.1698683675027783,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8433,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 1.196657806303993,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7706,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 1.2298160597232919,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7931,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 1.1391045969000217,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7492,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 1.1205475244780054,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7671,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 1.18514036810532,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8046,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 1.1780736050925318,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.9289,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.110286792602596,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.808,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 1.1733343791671424,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.9261,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 1.0814328442374197,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8203,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 1.1385768725890104,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7974,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 1.0216275712576461,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.6849,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 1.327797248643074,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8543,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 1.183051703626731,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8368,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 1.1337857151472512,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.6706,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 1.135855789118863,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7744,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 1.3569335916233816,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8503,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 1.1437156491959917,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7745,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 1.113672148884586,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.727,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 1.0071472374514407,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7332,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 1.1103465364736207,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8267,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 1.0693141101920882,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.779,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9573192627909511,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.6754,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 1.056695179414592,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7361,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 1.1824410547847057,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7389,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 1.240303615114854,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8205,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 1.0851520282652303,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7433,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 1.2824427242383798,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8288,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 1.2027009179273351,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8518,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 1.0018538317795969,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7416,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 1.0773970729250346,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7264,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 1.0385112139274224,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7199,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.1202961144635684,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7759,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 1.10002835715013,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7672,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 1.1523630454468816,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.782,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 1.2910603290325495,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8601,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 1.0313609737226423,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7411,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.2011834969161035,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8854,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 1.0578982681367388,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7166,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 1.126190003093847,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.728,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 1.1364729312140345,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7879,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.9556525935880364,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7246,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.0169082348426133,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.692,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 1.1399661009178772,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8633,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 1.025597618650162,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7531,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 1.1140897475974443,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7009,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.989635599878665,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7085,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.9861571045739278,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7402,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 1.1280775918681303,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7753,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.970219215323538,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.6952,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 1.0634205518795208,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.9528,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.9079606051649208,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.6912,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 1.0830255428134676,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7586,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.9682713423800534,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7374,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.9778161706645351,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.675,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 1.1138474318789258,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8055,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.9632096169407617,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7298,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 1.09864313311594,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7825,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 1.0390210658058932,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6806,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 1.2086212594832617,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7926,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 1.0230209515691793,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.6909,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 1.1026368694000623,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7129,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.2764382720926033,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7805,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 1.005584536935183,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.6905,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 1.1944269380445902,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.9269,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 1.0683837257803421,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7768,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 1.445206274329109,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7828,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.9845662569670789,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7021,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.9426691715777189,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7166,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 1.095566507890165,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7469,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 1.0000912230334456,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7413,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.9985260964561642,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7222,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.0365770710824433,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.6543,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 1.0560049830637563,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6799,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 1.2285475773918355,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.6613,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 1.1948610951408163,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7119,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 1.1392864200733446,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7611,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 1.026286407883845,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.78,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 1.0817341923761168,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.803,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 1.098411059588056,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8301,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 1.0307802416356842,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7403,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.9227971563943753,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6352,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.9404002808142968,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.707,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.9868828966467482,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7391,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 1.208640381494263,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8004,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 1.3273546340028264,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7896,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 1.1911031665705774,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8223,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 1.098678240240061,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6788,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 1.1857382862897203,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7293,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 1.007216283110517,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7151,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 1.264950302161379,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.772,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.982465076866397,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6562,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 1.0399829085179315,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7398,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.8972089218935757,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6605,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 1.0072069220509277,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7291,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 1.1137214709250107,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7609,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 1.0323488374422616,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7059,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.1455282929819348,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7978,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 1.2518720150891187,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7756,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 1.139310258657853,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8073,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 1.0180182174487173,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.6822,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 1.0863077091069504,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7754,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.9798952036583372,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7036,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 1.0702131643504458,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7164,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 1.1092023425287232,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7876,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.9681477457769564,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7024,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 1.0777622448498192,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7934,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 1.0378556048281868,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6783,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 1.078799504586605,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7475,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.9880842745377312,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6907,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 1.3298912447392872,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7766,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 1.0478723955052052,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.6743,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 1.082513016613253,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7341,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 1.1597793053417285,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7885,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 1.0514479318888261,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7331,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 1.0601751982528507,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.6553,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 1.0319484697425518,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7605,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 1.012216598945127,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.6985,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 1.0507711581883221,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7649,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 1.1102733037788175,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8312,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 1.1723596161929746,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.712,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 1.0108375187081908,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7188,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9392203267720511,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6394,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 1.150752101244234,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7492,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.9587303154259936,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.6729,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.9855902546008898,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.633,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 1.060546855777705,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6774,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 1.0355711807123962,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7186,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 1.1326202360080857,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7048,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 1.119484491703591,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7221,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 1.0831780159710254,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6418,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 1.0552261943119159,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7194,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 1.0329416858025888,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6308,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 1.0549094744987852,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7516,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 1.206286897863987,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.6335,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.9668575563358071,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6736,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 1.1902121758328896,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7796,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 1.035141759747216,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7214,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 1.0424215694596601,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7808,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.9479099900527113,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.627,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 1.1020536345657894,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7508,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.9970714751839678,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.779,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 1.2921192337162992,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7891,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 1.0558980245055478,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8122,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 1.0805873429862258,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8215,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.9248577857440794,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.592,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 1.123236762677146,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7329,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.895888853884425,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.604,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 1.0126898243531504,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7117,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 1.090178683291668,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7499,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 1.0905080067559638,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7259,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 1.2249133092092843,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8601,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 1.2319777842616844,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.5961,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 1.0604549084816592,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6648,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 1.042442851312364,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6882,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.975840517205056,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6608,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 1.102072511804385,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7783,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 1.0868007224800307,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7766,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 1.1404920509896797,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7486,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 1.14229321612636,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7688,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.8648821946642871,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6948,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.9551010631354665,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7006,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.9608900542742783,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7046,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.9590920777826683,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6704,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 1.0230550565367582,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.735,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 1.0291519546724859,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.8103,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.9339980403141241,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.6413,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 1.0511443805713403,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7237,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 1.0922518145677038,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7535,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 1.2553331397373684,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7583,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.9743102635915039,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.6881,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 1.1036394719223697,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.8104,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.9788220277837504,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7212,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.960277760082036,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6453,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.9458327687010233,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.668,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.9747996311203249,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.6374,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.9456870504677144,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6517,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.8625374012779374,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.5754,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.9080357709864995,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6524,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 1.075255370157194,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8184,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.9120963289083652,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6736,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.8977532308272703,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.5864,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 1.056234611336399,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7144,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.9401494515757224,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6107,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.9707218164731339,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.6492,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 1.3642147649660619,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.8226,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.9492156617664067,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6348,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.9818255136201256,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6432,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.9075058520846708,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6809,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.9161334178248897,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.639,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 1.0330443042304416,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6901,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 1.0947898284412185,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7464,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 1.004213113420765,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7511,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.9347148068533467,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6544,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.9978120044481448,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6873,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.9809643859106156,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6544,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 1.0029170228266115,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6255,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.9417083512697166,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6818,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 1.111134559079807,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.785,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.869568952356725,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6143,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 1.1588538526446959,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7077,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.8564976410122681,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.5817,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.9835681341640148,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.675,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.9661752945573999,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6143,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.9771623452698963,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6958,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 1.044800176172783,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.8356,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.9774178207640642,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6511,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.8994286932155998,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6797,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 1.0613102619576853,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7288,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.9167212286755693,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6564,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.925114318133125,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6213,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.8982728232717752,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6834,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.9549955696620802,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.5918,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 1.1297928094792418,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7396,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.8772073437397565,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6218,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.9165917991301511,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.5938,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.9603735838232083,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6543,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.9506735479947094,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6704,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 1.1405296151018007,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7492,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.948725655617116,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.5824,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.8840195525822759,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.5742,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 1.0050823635292419,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7589,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9518152908863455,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6223,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.9935413533356781,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6868,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.9391071497672916,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.67,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 1.0111144448690341,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6036,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 1.0528200546723232,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6703,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.9604242335429634,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6515,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.9714302901929514,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7044,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.891198807596005,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.5525,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 1.0040614684000009,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6712,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.9502194665760952,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7171,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.9344284914557824,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.5947,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 1.5216884120567316,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7081,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 1.0529099246063125,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7295,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.9624795340361507,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6547,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 1.0277158131759265,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6217,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 1.170824964889325,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7425,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.9519262978283264,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7318,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.9481035069089333,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6413,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 1.2202753892971472,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.5965,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.8646568279176724,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6388,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.9370755125251587,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7069,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.9558369774418428,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6096,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.8798063213833051,
+      "learning_rate": 0.0001,
+      "loss": 0.5768,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 1.0451492218020761,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7366,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.924152447792308,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.664,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.0744081909607641,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7296,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.955668756734273,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6078,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 1.3144636543595971,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.73,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 1.0668093428299052,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7291,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.9512383602185149,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6279,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 1.2922466129710943,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7305,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 1.0772051024912837,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6187,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.958020772759464,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6412,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.9069568866284937,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.5917,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.9297065653874017,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.5462,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.9650420091823859,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6332,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 1.1380448305985547,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6879,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.924710104212599,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6348,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 1.0015805912872244,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7113,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.9384437671302087,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6188,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 1.0604948892936632,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7284,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.9831510324855539,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6808,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 1.0069678330531393,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.5993,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.9680924106629697,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6227,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.888854482469667,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6456,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.9281177668939593,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6306,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.9751738708275449,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6838,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.9582206003918131,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6507,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.9001648649106497,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6308,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.9092857558878517,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.9636102854062633,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7061,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.9504775655843016,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7104,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.9057432352639376,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.5754,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 1.0007313803850506,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6785,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.9177411475104688,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.5778,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 1.0005491489675047,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.665,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.8983210808586486,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.5667,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.9795152857862884,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6444,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.873584571853934,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.615,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.9549504359740664,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6288,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.9188240525915284,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.5946,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.8607473920140348,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.5552,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.8639807183747521,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.5721,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.8850184972345249,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.5223,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 2.1591022434661853,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.5993,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 1.1043226783052784,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7933,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 1.0884233624626425,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7357,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 1.015113606562981,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6774,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 1.0502589325922018,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6794,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.9900878921092949,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6068,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.9497748309914926,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.5922,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.9705843187607024,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6093,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 1.010373441513078,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6703,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 2.1607022629679125,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.5826,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.9856272909655424,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6254,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.9295906722233483,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6555,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 1.0457233381248927,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6991,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 1.0108760483023163,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6769,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.9057058186724547,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.5315,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.9369047617658811,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6602,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.972680202054679,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6024,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 1.321048930261487,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6481,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 1.0577345819836654,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6457,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.9218333728889919,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6023,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.9399207896564503,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.5672,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 1.1250946584883899,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7217,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.9522625346054029,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6172,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.9922580878539909,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6201,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 1.0168541498929677,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.637,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.9687595220988009,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.5511,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.98923428378789,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6611,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 1.0975903332325236,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.638,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.9211169122137681,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.5502,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 1.1161164797290022,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.695,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.9946376550649852,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6079,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.967078877686999,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.5904,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 1.0989047275834596,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7644,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 1.0749511648643626,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6075,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 1.0113772216181294,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6055,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 1.0528699708098506,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.5924,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9884474205488997,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.5836,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 1.0817565853558482,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.5429,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.9666144235321176,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6413,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.9153559071033491,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.659,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.9137471340069504,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6103,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 1.510146019378789,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.4828,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.987447720692412,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7018,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.9238724147222424,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.5758,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.9389633061963913,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6042,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.9132362305366166,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.5972,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.8654296238944921,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.5282,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.8453176077925335,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.5726,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.9242326747879537,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6372,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.9308988892533574,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.5738,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.9071409171308088,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.5761,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 1.3028630461587731,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6067,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.9172024043826201,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.5679,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.8807958549351818,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.542,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.9704249571552533,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.5464,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 1.0969144972578753,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6738,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.8696864132878669,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.5705,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.9781208222151636,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.5582,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.9967778574729614,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6579,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 1.0082960039077469,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6184,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 1.0366166862534494,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5641,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.9920315082734081,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.669,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.9956020669208389,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.5392,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.8490541008850564,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.5029,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.9163974087483752,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.531,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 1.008949676876316,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.637,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.9030007856605029,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.5628,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.9449537478248724,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.5636,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.8198257100363766,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.4939,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.8867518601433609,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.5413,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 1.1818256787206338,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6136,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.9452893844358714,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5748,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.9275850629781354,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.5189,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.9002710366246102,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.597,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.9415059576235015,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6083,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.9647744573314104,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6556,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.8670661153270305,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.473,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.9646118936497053,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.5672,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.9095148534965027,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.5433,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 1.062269147102224,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6931,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.9241710738105149,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.5148,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.9559033451868152,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.5564,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 1.030323125995463,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6195,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 1.0271604613717926,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7014,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.8942452975398771,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5011,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.8955644246748649,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5521,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8332928517210115,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5414,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 1.1169520622611744,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6897,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 1.3097679094718664,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.5497,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.8772376782312741,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.5951,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 1.1755811538495757,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6902,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.9150533628307943,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.5854,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.9812405096651364,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.5138,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.9257901458035728,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.5882,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.9099467473629124,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.551,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.9739503971331055,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.5545,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.8385781188398105,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.4841,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 1.1359826928085195,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7864,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.9909228203441943,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.5488,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 1.0019540774010631,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6181,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.8385188190911381,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.5049,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.919376923204666,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.5377,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 1.0232245476746933,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6487,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.9618340330851003,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6218,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.9712726064151155,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.5529,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.9771971192171456,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6552,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.9786775722449325,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.5603,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 1.1332505597262252,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6783,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.9836611457205962,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6031,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.9773706601264938,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6464,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.8695143545764731,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.5364,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.9314585082087594,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6129,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.8352473468581094,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.5302,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.9288707870085203,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6198,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 1.054548843821071,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6421,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.8919318029675155,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5169,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 1.0066503456444664,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6089,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.9230978252331297,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.5957,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.8789633076103941,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5541,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.8718844378045656,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.5485,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.9335181761420575,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6404,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.9484725144376542,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.59,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.8382977366416045,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.5561,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.9355603237904612,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.5818,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.961809490288736,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6146,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.8567670714065785,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.5652,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.9455553754652066,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.5788,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 1.0721800816234002,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.5842,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.7675806075541949,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.4302,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.991350128046776,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5715,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.9511054276695102,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6427,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 1.0426683588308392,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5714,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.9523954557582163,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5734,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.7987380143628661,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.5117,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.8573326519139678,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.5222,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.979581786160496,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.5971,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.778165869500947,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.4771,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.8457108669461191,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.4731,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.9508003542076502,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6279,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.9958779714256399,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6277,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.9862852127069748,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.5806,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.9397524949225728,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5331,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.9083043981571588,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5043,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 1.0118708577850164,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5932,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 1.1251363208694125,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6196,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.8574627852708463,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5193,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.7780339043692484,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.4682,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 1.1964531716023608,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6528,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.9404864780809205,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6223,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 1.0226708718743212,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6927,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 1.137349776592833,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6141,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.9814117218965409,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.5735,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.8830516289878225,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5071,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.996486615239273,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.5259,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 1.0330770451297686,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6197,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.8319899947732018,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.4996,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.9250451531628848,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.5247,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 1.144102148606056,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7603,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.8191778600556882,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.5248,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 1.0247859395725494,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6693,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.9576748722399607,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5266,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.9386852671410497,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5242,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 1.0008594863534126,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.5434,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.8296819976374453,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.4819,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 1.0366267401553004,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.648,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.9301658256767201,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5796,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.9540408598436331,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6085,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.9601452062599933,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6306,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 1.1064698430304876,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7302,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.8406468101560786,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.5,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 1.0896860673883275,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6729,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.912833026472872,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5126,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.8657286990660678,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.4976,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.825689451879938,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.4834,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.9905337253057244,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.5336,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.9726342394010897,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6517,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.9142314798265335,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6603,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.898642677276061,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.4928,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.8916371940098342,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5164,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 1.4104850715242292,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.5929,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.7293795782350329,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.4157,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 1.019652420183055,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7263,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.9448381054119659,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.586,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.8637348751302155,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.4999,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.9690117974376475,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6164,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.9531128366285172,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.5583,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.9078704210432088,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5536,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.9630577217237317,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.5851,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.9270212590280982,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6402,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.9423601972149899,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5566,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.9976033030848523,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5256,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.9393026960150982,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6015,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.9274191647891893,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.579,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.9942581681622578,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.582,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.9681948735153119,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.5492,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.9833395857883787,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6341,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.7933212026979115,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.5183,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 1.0599017928831775,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.762,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.9256304637576591,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.521,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.9585611922630486,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6063,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.8490070058645339,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.4427,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.891364762865935,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.5148,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.9988470856594625,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6633,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.9444281906347085,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5217,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.8752628215788081,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.4908,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.9694387789394738,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6527,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.9022866716275811,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5921,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.9205268255297675,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.4699,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.8755805675570406,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.4811,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.8912675243422706,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.5741,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.9689570717903273,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6094,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8392206648042595,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.5112,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.8072022169653216,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.448,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.7790347510000438,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.402,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.8953515136947062,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5285,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.9809443047886482,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.5989,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.8975191336454618,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.555,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.9629421299799577,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.5406,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.8243589514012051,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.4341,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 1.479828868077692,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.5796,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 1.1184626925960366,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.5626,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.7892836016009552,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.4644,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.8453496028010578,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.5184,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 1.415420131082792,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.5357,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 1.9466599285665418,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5657,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.9510079260717255,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.53,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.8046507506345103,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.4359,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.9085865871187612,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.5194,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 2.1375597713997694,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5032,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.8858092438131193,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.5091,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.9982469502268863,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.5391,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.8523525415208159,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5074,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 1.0576780431109767,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.521,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 1.2215689985898792,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.542,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.9951221792223286,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5588,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 1.1516393717472841,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7024,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8699478987557641,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.5441,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.9761622409616965,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6659,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 1.0657941819580883,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6802,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.9483310552984877,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.5146,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 2.9508099527683926,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6694,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 1.1203177018025743,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6277,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.8712036353092558,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.4921,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.8349081617610802,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.4697,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.9816640206077006,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6025,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.8137012439481905,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.5452,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.8775138259188492,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.5815,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.9510137492394066,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5334,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 1.1228685544623338,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6673,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.9574163289616511,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.608,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.9693634569140042,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6054,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.9760444131755792,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.5478,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.9474416415255731,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5137,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 1.0300094506610928,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.5334,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.8898281929370233,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5279,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 1.3279687374109668,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7511,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.854279957525338,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.5012,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.9686724654578658,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.5938,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.9521684426642905,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6124,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.8766730681498675,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.4793,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.8576827239737382,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.4929,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.0180614580257639,
+      "learning_rate": 0.0,
+      "loss": 0.5727,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 82273390780416.0,
+      "train_loss": 0.6731263710498809,
+      "train_runtime": 5195.4773,
+      "train_samples_per_second": 1.925,
+      "train_steps_per_second": 0.12
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 82273390780416.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e8b6814e3d80a4fc8d72542ebf99a79124e6feb
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4b99e56679a991f138b6408e617c8c486856859a
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cabb144a8f73e1386612a4f2e8bdb914dcdc1f1ff59b3ba0cb089fd218695976
+size 671150064
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..663527f23602619fab33e6381707423cb7e97433
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:549752e36daf80a072340c08e532c8c2d7715202ab7e88363018d122504a4ffe
+size 918507402
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..391140927b14c0dfaa2d377b66159d516ecab6ec
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 4.408342819175507,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.4923,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 4.110439939161045,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4308,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 3.627360088009308,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.515,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 2.509499785965118,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4493,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 2.323968524411629,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3003,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.995055111232002,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1018,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 1.889932196942419,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.1819,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.6684082430973282,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.02,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 3.7755827325690263,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 0.9298,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 2.196627684433904,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0584,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 1.6824981348422472,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 0.8387,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.6198472340502856,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.8099,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 1.8117757984096798,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.1271,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.5445028531856497,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8958,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 1.6869380158754907,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.9061,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.4254207766658846,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8401,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 1.3931299025207386,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.8321,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 1.328820134404943,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8292,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 1.3628683546991005,
+      "learning_rate": 0.0001,
+      "loss": 0.9717,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.071482812407722,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.7228,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 1.2276194299084946,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.7846,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 1.2605068638679624,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8337,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 1.2125163219825974,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.7455,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.3559412240231647,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8704,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.1931836827361815,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.7802,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 1.2815768897397926,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.7516,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 1.2487451213256184,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8588,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.3428187117644304,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.7912,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 1.5896900378362886,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.7279,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 1.6758848687903816,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8775,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 1.2914936264462082,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8787,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.131260376713681,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8037,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 1.3915706978304456,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8949,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 1.3423957957870956,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.0454,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 1.1888239761622272,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8437,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.2529062114232787,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8602,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 1.4199469178386859,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.7949,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 1.414948053728861,
+      "learning_rate": 0.0002,
+      "loss": 0.8271,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 1.1710159773683249,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.7915,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.5657566210361664,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8607,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 1.446707772738992,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.7905,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 1.517378154263656,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9057,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 1.2143992013412503,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7846,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.1886700399966317,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7071,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 1.1456550756916823,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8241,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 1.2086948448352206,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8328,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 1.238059074281379,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.7725,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.2570874197951665,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8732,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 1.2636827067135736,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.7409,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.354771516961733,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8311,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 1.1547283885905204,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.706,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.3394959141288645,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.7398,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 1.367883852532359,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8653,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 1.3107142541709331,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8879,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 1.1805821359949669,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.7184,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.1968923989864841,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7353,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 1.4422865807870915,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.849,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 1.2480605497927955,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7776,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 1.2418802053286733,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.7971,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.1927030318890224,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8142,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 1.1873157476816283,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.7529,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 1.2902061763218409,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8838,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 1.340897802880216,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8265,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.2373723619717891,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.719,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 1.107760632682364,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.7628,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 1.1981904383495332,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8435,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 1.1429350930888995,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.7228,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 1.435754871570911,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7299,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 1.1865159800313239,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8046,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 1.176760552196966,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8158,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 1.10103801941083,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.7268,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.305214804300615,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8605,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 1.2743862748586496,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.9368,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 1.0492622374201535,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7463,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.1931733147785315,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.8728,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.0460031573743418,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7114,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 1.1399063887491887,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.7837,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 1.0790701153365434,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7044,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 1.1576158790364408,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8326,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.0646241406248742,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.6692,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 1.1308196743214243,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.6967,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 1.149997527773717,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7963,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 1.1742855406191646,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8313,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.127633976627731,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8215,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 1.2741868523423572,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.9249,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 1.1549550551753378,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7753,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 1.222430259153762,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.7563,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 1.0514952641117037,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8027,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 1.086998228036307,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.7755,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.1630351309565898,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7805,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 1.184830463331376,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.7548,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 1.434415169297192,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7904,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 1.2341565822260379,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.74,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 1.32604291197053,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8844,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 1.1274789532241294,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8091,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 1.1333728621491337,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7048,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 1.4217150158455454,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8696,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 1.236434536080269,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8657,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 1.2906232510193867,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.9025,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.174051064286104,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7729,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 1.190405459676803,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.7598,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 1.2428373437223619,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8066,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 1.0756769612595862,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.7582,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 1.053929056366854,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7495,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 1.302085916712659,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.8239,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 1.2202083319942585,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7572,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 1.123037646990899,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.727,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 1.096235066230464,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8576,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 1.0847203500993057,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.7677,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.1951784729681403,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7637,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 1.3106303412164404,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.9177,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 1.01740925515726,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7668,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 1.1165587109703807,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.6997,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 1.339872329337332,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.9134,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 1.155065798032305,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.7457,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.115420938539364,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7114,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 1.0275049872706479,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.7621,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 1.1422631177427438,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7088,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 1.1723839805566112,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.7436,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.0082428312322236,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7748,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 1.0938380826486647,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7407,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 1.1495446964574842,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8554,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.9819945252756392,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.7449,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.9841459680536125,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7292,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.377259630104572,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.8627,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 1.0954099330436704,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8581,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 1.0801564424791268,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.7146,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 1.186455419707288,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8809,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 1.1156046251711629,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.8294,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 1.0552736379204224,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7684,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 1.0881246454501003,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.7453,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 1.0023599103674323,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.77,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 1.1699658416978167,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.8204,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 1.1614645359235414,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7786,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 1.097653278367737,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.6808,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 1.1734841782207333,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8497,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 1.0942018776385893,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.6629,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 1.1238186898701805,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.725,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 1.2471987096128463,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.7983,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.22943045949165,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8379,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.9922021980387541,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.7344,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 1.1142637872966648,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7845,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 1.0269659172627574,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.7648,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 1.0338558129722986,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7507,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 1.0709258671144004,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.7821,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 1.252304631033193,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8575,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 1.0327264932414084,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7447,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 1.2688781875903254,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8508,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 1.1803015987731276,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.7471,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.0899050814056226,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7357,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.9177192363588716,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.6452,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 1.1109819282825883,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7391,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 1.0845202801960454,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.799,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.9784873646403908,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.6742,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 1.1502313941088143,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.7794,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 1.2651075421089168,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8077,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 1.2660307896771321,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.9314,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 1.219453764327284,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.9102,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 1.1752038197472823,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.859,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 1.042717048482976,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7813,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.9947703403682241,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.7639,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 1.0285566389334777,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7345,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 1.029063636323494,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.7483,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 1.0906097120134792,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7249,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 1.1334391639310268,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7058,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.9833642785833447,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7198,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 1.2967508433454027,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 1.0848,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 1.2014098936434896,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7061,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.9760578979869019,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.6657,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.1154382155788707,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.721,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 1.1067342540180944,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7666,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 1.1078308318396748,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7355,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 1.0664046934283806,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7811,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 1.1422077472163201,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7691,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.0985709920564717,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.731,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 1.1252405981323013,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7998,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 1.0675403907916068,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7248,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 1.05184998619709,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7517,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 1.1147301140311476,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7584,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.9105532274684514,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7178,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 1.0654400102516637,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.8339,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 1.013459177418056,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7937,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 1.0619435864285793,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.8075,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 1.0415089705114855,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.9126,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.9923339843313642,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.6941,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 1.05939359254116,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7379,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 1.056017771383427,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.8552,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 1.1849751523305005,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8967,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 1.0163769370421663,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.6705,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 1.1637448866611821,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7875,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 1.0582185088998888,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.7379,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 1.107910568839856,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8034,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 1.193940802100663,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.9218,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 1.031544957221749,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8096,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 1.0550279337742818,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7969,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 1.339378720092458,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7135,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 1.053673240761701,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.6952,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.9992381473663097,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.6662,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 1.0255698032449383,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.6567,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.133600671042738,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7207,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 1.2134110529027724,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7027,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 1.2053506400272829,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7832,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 1.1102749097485602,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7118,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 1.178902593048484,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7437,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.9933065180211077,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.7066,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 1.1495869058395793,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7963,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 1.1003146005307034,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.7443,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.9808868668847505,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.6977,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 1.1925770685238926,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.6928,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.927693974621165,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.6247,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 1.0742454706985174,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.7723,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 1.051319474324855,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8035,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 1.1216518209851702,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.7857,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 1.0553361616852703,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7145,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 1.0715026333857263,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7705,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 1.0211629635492872,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7289,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 1.0409857521666497,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.7827,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.9309511034517031,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7313,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.9576692146265773,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.6621,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.062739137706439,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7597,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.9310388910091685,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7826,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 1.1253541815203907,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8474,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 1.1166945029798787,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.7838,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 1.1496684622972821,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7847,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.0678223253517922,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.8023,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 1.1793532152413913,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.794,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.9831007867254152,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.6924,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 1.0453689547667198,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7247,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.9882385097853617,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7262,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.080323670807221,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.6887,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.9501516932213585,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.7022,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 1.1400626270113077,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8338,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 1.0003535208374308,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.7794,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 1.1702426650805333,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8726,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 1.184758961113029,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.8259,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 1.112356983214534,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8045,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.9831874184738855,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.6723,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 1.026630504115062,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7538,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 1.0318358455151204,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7098,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.0884518238836358,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.762,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 1.1808121910364464,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.8122,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 1.0831484545402992,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7173,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 1.065442394577919,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7634,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 1.176715798585199,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8176,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.9816781183969229,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.6896,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 1.099486782417909,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8123,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 1.0148183908798116,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.7322,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.9799797820366946,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7213,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 1.0340760098203616,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.703,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.5978607470498314,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.6617,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 1.088949176532953,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.8486,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 1.0390771666964085,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7702,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.9454467561447718,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.6682,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 1.0688599597247057,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8487,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.8988151969453305,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.6799,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.9681952414826085,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.6717,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 1.1719052599277335,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.8016,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.954039432416863,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.6911,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 1.2131581008392034,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.8597,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.9789998124089584,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.6927,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.9877726243104934,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.6464,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 1.0150532508929966,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7407,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 1.0882168566410513,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.724,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.982861951878359,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7284,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 1.0030001487449376,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7069,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 1.0280008372899851,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7594,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 1.0962550866233343,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.8523,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 1.0457328345364145,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.6922,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 1.4036703444178804,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.8643,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.9554017750179316,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.697,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.9682042596566531,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.7175,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.9415174317375208,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6513,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 1.2727514684350927,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.8116,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.9921199460736623,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.66,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.0137544087850203,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.7267,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.974035983194552,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.6646,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 1.088398496009498,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7733,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.9967354379387562,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.6388,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.9526052437876638,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.6986,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.0289683091704833,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7501,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.9462458656763387,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.7066,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 1.0437208398768452,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7938,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 1.0343520846176977,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.7036,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.9577644708974145,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.6413,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 1.1006775477218305,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8025,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 1.0151788021543564,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7577,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.9937451275659185,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7137,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 1.0350026345426953,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7823,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.9108555752247096,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.735,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 1.0668111431297043,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7714,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.912021860911979,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.6831,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.9905284496676833,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7561,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 1.1232141052409121,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.77,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.9931298258308252,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7114,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 1.1089171732957042,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.7296,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 1.1286852814468344,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7465,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 1.1190139468756013,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.6236,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 1.1483830524013459,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7324,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 1.2212551541061174,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.8115,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.090068717757821,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7191,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 1.1085791446066866,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.8004,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 1.0113359039302352,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6248,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 1.0964367246355113,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7408,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 1.0301462814934836,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7247,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 1.0739566603530641,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7033,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.8807320293892692,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.6746,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.9793213144526057,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7779,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.9415126280388529,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6607,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.9909061915628322,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.714,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 1.00513563363099,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.756,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 1.1245792806635877,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7553,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 1.1950916131345375,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8932,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.9932996847458989,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.6877,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 1.248713647763005,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.9275,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 1.2215744887666666,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.8051,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 1.0512964840297354,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.6843,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.9928744107754482,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.6953,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 1.105947283350789,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7055,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.9883961136937218,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.6444,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 1.0207447755812524,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7367,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 1.0094331506812593,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.8237,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 1.0138108530230707,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.703,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.928129814825029,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.6776,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 1.0288030369711199,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7732,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.0640848613747005,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.777,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 1.0528999133328119,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8037,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.9653114458714556,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.6779,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 1.0262546590838455,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7933,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.9164158735787065,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.6451,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 1.066701177172031,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7566,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.9439582191068443,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.6703,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.9459146283947549,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.6672,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.9981667462424716,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.6923,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 1.0875381794581624,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7673,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.8762075795160603,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.5637,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 1.0604507702776595,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7326,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.9143594094269025,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.6037,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 1.0636858738692445,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7581,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.9318846711345694,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.6136,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 1.0384488393986446,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.729,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 1.0477460175382347,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8111,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 1.105830244673777,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7597,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.9834120376978399,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.6362,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 1.098162845193706,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.725,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 1.012436854456873,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.7046,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.9760021946383537,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.6925,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.9382328441194895,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.6759,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.9760909397548315,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7327,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 1.1133709538310124,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.697,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.933158293206893,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6591,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.8610989939996339,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.5716,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.9861142706518471,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6708,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 1.0257131017794683,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7431,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 1.0198752411317464,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7411,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 1.0409890244815698,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.7328,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.988010259209258,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.6229,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 1.1042524286135829,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.6528,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 1.167309119109961,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.6666,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 1.3041398533142197,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.6798,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 1.0865400113033143,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7536,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.8555479713966215,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.5758,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 1.0568143499880747,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7081,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 1.0162822170661614,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.6428,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 1.0359036973974367,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7316,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.9560884895362259,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.653,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 1.1544322689744408,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7334,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.953809912936542,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.6703,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 1.0444013863707968,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.6968,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.9447882428261958,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.68,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 1.0831435406919236,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6961,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 1.1363438282806402,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7148,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 1.0696012560299293,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7761,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.9333342558350548,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.6054,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.9033082485068732,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6315,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.0930337368883383,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7495,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 1.081838167819764,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7293,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.9923722852152731,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.6889,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.9196648489839756,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.6371,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 1.2048809375107268,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.8533,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.9673963449218785,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.6977,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.9703411130239993,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7172,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.9373760822707912,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7073,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.994147007240893,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7076,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 1.0746978834189227,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7495,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 1.052331627756332,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.6775,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 1.0672546588497585,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7034,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 1.215223998177791,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.7603,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.9440701837765714,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.6243,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.9438729501972922,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.6566,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 1.0654804345994373,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7174,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 1.1763623753804215,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7637,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 1.0741594802453314,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7241,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.9675989080151941,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7439,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.9636647510459591,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6479,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 1.0087841611490806,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7383,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.9818867802966742,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6549,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 1.0355078040973305,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7541,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.9802743835120142,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6655,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.971194464938782,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7107,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.1220475607440976,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6531,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 1.0605304536118139,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.6269,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 1.2204345511388361,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7026,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.9738217416134146,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.5849,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.9441774928753534,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.5969,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 1.12389339746799,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.7606,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.9304351293542036,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.639,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.9565046885589732,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.6884,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 1.205634227332287,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8178,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 1.0188207226357398,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.7172,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 1.0540021733325038,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6787,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.9948112159703907,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.6498,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.8836323531769209,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6339,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.9576548443778085,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6895,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 1.1102043397230095,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7384,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 1.1351565388435783,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.7097,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 1.146994910537703,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7317,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.9918991649557216,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.6544,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 1.198648658812263,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8527,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.9626362344342084,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.6238,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.9499452817998587,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6527,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 1.0658101866983538,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.8236,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.8084895765521102,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.5902,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.9690490833614454,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.7191,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 1.008697111330277,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7917,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.9646953454012074,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.6793,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.9653443273926454,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6591,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.9179052874207917,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.6613,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 1.0562144373472704,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.6906,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 1.0950071511884267,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7611,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 1.0185041599966833,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7104,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 1.0044446718738724,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.6764,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.9120289651783,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6452,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.9606647124420223,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.671,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 1.0245320681964516,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.654,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 1.6576495465222114,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.847,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.9865908610386174,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.6993,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 1.1295837213180702,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7228,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.948216062157724,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6986,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 1.0487720133192266,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.6659,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 1.0532693653317589,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6925,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 1.1645235432270866,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.7907,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 1.0657274676274178,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7278,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.9943155432977927,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6269,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 1.1107776604547468,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7161,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.9734125226125735,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.6816,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.9992083479961862,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6643,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 1.0558325256562415,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.7538,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 1.0821258580451227,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7827,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 1.0243746286607833,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7297,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.9717240585903687,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.624,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 1.0315755463193517,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.72,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.9457788209819127,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6104,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.8622451847398616,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.5773,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.9170339040279909,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6697,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.9309659564189371,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.6061,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 1.0139811893861588,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6063,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.9565559184221922,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.5972,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 1.5548048389718108,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.5764,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 1.0799981251001614,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.6595,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 1.049770262215874,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7363,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 1.0325936012889165,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.6885,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.8625772450786451,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6287,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.9112822988052758,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.5691,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 1.0632388018056593,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6636,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 1.029535711215482,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.6906,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.9591614110692922,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6314,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.8949330694021034,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.5899,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.9153773857789488,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6051,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 1.0849278669728846,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7544,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 1.142813464120941,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7313,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 1.001543961930351,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.6699,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 1.0425623116876044,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.5833,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 1.050040552652407,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.6643,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 1.1424081378005362,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7225,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.1312337168049746,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.6858,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 1.0679202158442578,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6776,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 1.067151691225152,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.6363,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.9431028592814902,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6697,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.9549602521283611,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.7064,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.9529789295668323,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7105,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 1.0965618097118262,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7246,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.9811034508234098,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7697,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.956274166666771,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.6784,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.9798468248476327,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6797,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 1.0375968663501707,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.6382,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.9662694880416806,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6941,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 1.0410310257756477,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.701,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 1.069424086283332,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8443,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.973476345898565,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.644,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.9690990691047512,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7045,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.878145219403551,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.6742,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.9063044481911675,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.641,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 1.001203019256862,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.7444,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.9110717667370313,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6417,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.8802273740541856,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.6553,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 1.0995829480405193,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.769,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 1.0338274485826533,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.6832,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 1.0007253517744736,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7447,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.971200354777103,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6645,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.1064379377231044,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7071,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 1.0282297754784526,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.7565,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 1.0170495081001916,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6728,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.981573108226441,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.6613,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.9612999332629785,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.685,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 1.1495625174354223,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7439,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 1.0625147958003944,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7942,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.8656809536333867,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.62,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 1.0814638376729038,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6849,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 1.0136592426883342,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7686,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 1.0653979211052498,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7356,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 1.5524672487774989,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7126,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 1.2407551782976203,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7939,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.9613723023205167,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.6928,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 1.0449124354890031,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7082,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.9340571369355848,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.6169,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.9700227307466612,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6793,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 1.0780444177733026,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.6701,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.9821818841705071,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.643,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.9814401515031396,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.6788,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 1.0926726449940525,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7499,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.9902456615356826,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.5531,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.9813485221150713,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6782,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 1.0354255517380966,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7147,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 1.0282080376059664,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7793,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.9748715142365796,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.6833,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 1.040452534646341,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7646,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.9523916277181167,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.7967,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 1.0329630631418163,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7531,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 1.1285527698971287,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.7493,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.9190585160356773,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6446,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.9082981214968774,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.5956,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.9444245855283278,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.654,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 1.0689970782226674,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7062,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 1.0546696322924753,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.683,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.9971647907901832,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.6792,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.9892976751183552,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6402,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.8936332990317126,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.6351,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.925522606737151,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.5876,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.992246902537022,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.6892,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 1.0045201876739385,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.6797,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.90811240999659,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.6378,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.9385694690761545,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.616,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.7917103910922637,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.5023,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.870431902242613,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6534,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.8823717609207367,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.6739,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.8265835296319374,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6082,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.9405103526779204,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.6384,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 1.024733649321451,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7039,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.9268780097595519,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6524,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8781618247231748,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.5848,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.9290248266795664,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.712,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.9193603267840847,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6238,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.9482222228787937,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.6282,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.8862451690628104,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6242,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 1.1310323019857198,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.6774,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 1.2015874953589551,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.798,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 1.0066583588082008,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.5747,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 1.2487930533796985,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7472,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 1.0197015980249613,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7325,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.958331423601618,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6506,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 1.0849774317040684,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7727,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.9955829303680328,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6899,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 1.0325170766691616,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7209,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.9962515122153125,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6045,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.9424554738981582,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.6261,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.9008228342261408,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6281,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.9795315779902153,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.669,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.9181733443905542,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6354,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.9964838418025049,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.606,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.9713730027331944,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6361,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 1.0127955889148383,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.661,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.8965977593232317,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.5803,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.9555255268081055,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.67,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 1.429931874393292,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.8253,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.9080529225486037,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.5789,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.9561747406233209,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6863,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.8795515505847675,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.6141,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 1.1153897316373964,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7158,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 1.0239509952378902,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.6895,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 1.0255634523001187,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7033,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 1.0779694730191423,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.7482,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.9033055174636703,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.5817,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.9234135553682853,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.6561,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 1.0930306682285496,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7402,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 1.0891783333532241,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.838,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.9063121785903229,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.5588,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.8756515092653024,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.5871,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.9052639166165494,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.628,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.9403490201939707,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.6514,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 1.0477290724502009,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.5836,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.9100473365039066,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.6018,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.9016002157479855,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.5607,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.9850404704925205,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.6702,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.9749236728270172,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.5877,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.9399585491083582,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.6119,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 1.0947518739913256,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.8437,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.8320013335953632,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.6518,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.9468846118303392,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6035,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.9159399662224321,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.6246,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9676794539813114,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.5911,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.9252959644110134,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.6548,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.8685234570929422,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.5444,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.999888802188266,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.6613,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.9258630185529607,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7428,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.8978351873148683,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.5685,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.8867834326441457,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.5201,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.8236581805329797,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.5255,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.9372515761391587,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6118,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.8860205977781878,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.6414,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 1.0121532447470982,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7489,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.95363120804436,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.6106,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.9254678867628731,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.5573,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.9674567730705677,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.6123,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.929134356753628,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6086,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 1.0062273044589374,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.7352,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.856612433286552,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.5335,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.9013543780447304,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.5392,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.9990058368463038,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7409,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.9993198048944072,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.6626,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.8561173649083798,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6048,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.8483751866663903,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.6365,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.9456170963528224,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.5824,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.9046667406999749,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.5922,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.8750038332233276,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.567,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.8584138388306187,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.5831,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 1.0390547317146437,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6902,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.9994915776953593,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6477,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 1.0770129680588285,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.5568,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 1.0941378840305556,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.6017,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 1.048632007629162,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.622,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.9955624104751872,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6559,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 1.0259937315294585,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7529,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 1.0051406872664894,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.6572,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.8437661707095883,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.5758,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.9068740628413372,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.5819,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 1.0071286102184436,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6693,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.8846472732604066,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.536,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 1.1775969933001442,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7716,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.8841109595513913,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.591,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.845641593117227,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.587,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.9093340586120714,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.6042,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.9160365891428205,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6342,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.8473297542447655,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.5884,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.8494577589658796,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.8867037444326773,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.6235,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.990154130055683,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.5907,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 1.0263388140013465,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6316,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 1.1271972521050353,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.5525,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 1.0837906964394204,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6587,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.0043765323916656,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6208,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.8366245514174349,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.4771,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.918646383048879,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.5758,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.9064290959922201,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.5214,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 1.004774070235783,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6805,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.9282402680467122,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.5931,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 1.0751875949022203,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6883,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.9337831027390505,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.5999,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.8738135695779778,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6364,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.8801911032071335,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.5952,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.9823176815317364,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6753,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.8879311198249467,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.5486,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.8535088345660301,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.5567,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.9407793728029339,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.674,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.916554270033362,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6477,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.9613974681796721,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.593,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 1.1380334060060602,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.5568,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.9110819234222967,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.6018,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.9152424635634194,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6375,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.8918539267738597,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.5792,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.9025975880738885,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6599,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.840404349020553,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.5668,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.9320459313831757,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.5902,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 1.050896801767698,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.7424,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.8374864129093207,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.4816,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.0697389056145188,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.6934,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.874226541591921,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.5467,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.9217989977358017,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6351,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 1.107297223197541,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7024,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 1.0106730056854827,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.5998,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.8501042546831452,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.5953,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.9052200769081522,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.5746,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.8894211309263867,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.5901,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.922607707471488,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6184,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.8862217002560381,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.612,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.9911155918338997,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.611,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.7657420492665408,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.5286,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.8997367303235172,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.577,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.8319139015288592,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.5347,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.9457280659526005,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6461,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.9271953572324194,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6099,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.9260935366306379,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.5267,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.9528620793950472,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.5478,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 1.0044560988345654,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7119,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.8135108163191106,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.5216,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.8980222891717288,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.576,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 1.0477335377395407,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6906,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 1.0547626667854606,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.656,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.8798438501542369,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.5354,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 1.0117863162265472,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.6,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.895018639948916,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6008,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.9580031145377335,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.6302,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.9021281611904125,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6171,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.9785185862912812,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.6775,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.843433757105957,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.5161,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.9059987007232777,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.5804,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.9850582512568193,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.8949371449296837,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.5907,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.7637689981060631,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.5015,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.9173008428956848,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.5879,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.9345140097350884,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6243,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.946836398596421,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.6282,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.8998445459421743,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.5566,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.9022030377922716,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.5598,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.9471302548173,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.5697,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.9315863652553878,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.6164,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 1.0700682801215697,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6168,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 1.0606049920115181,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.64,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.8285105075180479,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.5487,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.8752142319351934,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.5347,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.9651658763970603,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.4671,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.9657917378841041,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6231,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.9435597035462718,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6741,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.8761344875784453,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.4783,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 1.0357740548151377,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6813,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.8533213245126312,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.5665,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.8780721704318892,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.5007,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.9449061063034562,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6092,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.9716800570034508,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6404,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.8939295826894426,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.6126,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.9111497105991285,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.545,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.9845780649515906,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6521,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 1.060729966646695,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6937,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 1.1137329565177583,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6442,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.9795416162775744,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6429,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.9101660654997451,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.518,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 1.029837582749723,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.5742,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.8829773480744334,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.5556,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.9135755633321601,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.5405,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.9503980034000901,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.5779,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 1.0232350032188056,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.624,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 1.0241056751401028,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.5954,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.9520941462332658,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.5899,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 1.1016754977518257,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.6661,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 1.1279637263444664,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8222,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 1.022109420265956,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.5703,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 1.0138360617776458,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6399,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.9042528295182145,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.5849,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.9197630436567867,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.5542,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.8072283928049032,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.4602,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.9247334266158035,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6019,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.8401822746345465,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.5639,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.9648310078029783,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6491,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.9603080004347097,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.6523,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.9347051369082151,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.596,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.9923423082065466,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.611,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.9035717517346512,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6214,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.9806422813540163,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6559,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.8358362273716006,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.5557,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.9762546938398565,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.7442,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8457968285899077,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6362,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.9867394684229117,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6626,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.9804817061905197,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.5673,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.8686750808478814,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.5164,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 1.0739535691613076,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.974910792051444,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.5915,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.9274313889916219,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.5763,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 1.1337383279689464,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6324,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.8722391993884088,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.5442,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.9008678609241617,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.5032,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.8786081409518408,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.5224,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.8782753024763686,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.5512,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.9485361902342623,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.468,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.9980592476102427,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6164,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.890415820394609,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.582,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.0930709924661823,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6374,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.8602461742143718,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.5258,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.9049447130718785,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.615,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.8478137147670415,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.5523,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 1.0428062444573134,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.6102,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.9060803547799261,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.5192,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.8671851045560147,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.5236,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.9030872650059336,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.5564,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.8891999761941229,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.4803,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 1.2175211199100915,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.606,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.9223325697404515,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.6114,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 1.1518593501565233,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.5726,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 1.00007730967823,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6566,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.9727693231297316,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.5765,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.9033874622473923,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6042,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.9458238352865332,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.5683,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.8562750584506673,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.5072,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.9794396912129939,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6185,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.8850179136959749,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.5479,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 1.179241015400573,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.781,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.9272477221458543,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.5702,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.9009021563821138,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.561,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.8581708176413048,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.5186,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.8723976354383418,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.5318,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.7431457385938055,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.4181,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.8837427101900246,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.5868,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.8579725182366548,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.5344,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.9071593023553052,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.5674,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.8253343827701816,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.5168,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.8347321820495228,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.4898,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.8580588005870989,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6004,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.9068280468472921,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6366,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.8725133877344808,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.5617,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 1.1267908594103957,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.665,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.9734516620486875,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6219,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.9183237827735163,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.5942,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.9320508265907027,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.6137,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.8649942159597835,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.5499,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.8939880978861915,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.5474,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.7997400615731897,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.5049,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.897826974097443,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.5178,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.9137494525351595,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.5506,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.8968322331004761,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.5621,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.8935469934960261,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.5297,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.9802607196989127,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.609,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.9244342711411846,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6323,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.9921448739093076,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6659,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.8082608559518543,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.4815,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.8853742615880639,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.5958,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.9318274124318456,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.5057,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8440535946971125,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.5626,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 1.0839068704747066,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6763,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.9503143284762061,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.5612,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.8985459420279938,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6194,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.9588031538158126,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.5763,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.9903749264660486,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.5527,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 1.0545286776848906,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.629,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.9187204254406824,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.5336,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.9074774869812791,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.5807,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.9354272316349254,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.603,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.8621968567745608,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.5029,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 1.0503686217853245,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6687,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.8087437735035757,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.5049,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 1.0286699396273984,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7108,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 1.1222870059539025,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.5778,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.8682346562913431,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.5769,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.8970792816397353,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.5973,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.8758641896709833,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.541,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.8193765480762364,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.4439,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.9642302230497448,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6231,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 1.202530953279566,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.7005,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.8186776036576376,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.5625,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.8509909294818085,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.5207,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.8630572024442712,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5531,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.9776376629053878,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6213,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8327183472419162,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.5478,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.9629843450121315,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.6228,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.9428958922382763,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6306,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.8766919189818014,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.5252,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.9607715277505974,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.5826,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.8811357710516505,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.558,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.8633162238313675,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6029,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.7826786876638742,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.4493,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 1.1234018311636582,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5906,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.8133381724130317,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.4849,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.8811594930337688,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.5501,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.9625523729130937,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.602,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 1.7004748721610694,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7473,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.9115527765393678,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.5729,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.8633236144748906,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.5243,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.861669670026954,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.4855,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 1.0431346413694014,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.648,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.918967588419062,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.5108,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 1.010661153516246,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.5907,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.8450664077888524,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.4896,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.9923031127222196,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6135,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.9288854200687368,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.5613,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.9522743141061653,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.4701,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 1.0094695453286113,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.58,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.970218865338825,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.5926,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.790855904118202,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.4653,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.8422660876340917,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.4801,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 1.0014831126614723,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.4986,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.9900852217587683,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.5694,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.9417817304764583,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.5908,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.9192656513426533,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.561,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.9390210428837307,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.5637,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.8023042426447257,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.5082,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.994269753573064,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.5859,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.9581278006089728,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6368,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 1.157025545060076,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.731,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.9026081678125137,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.574,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.8511295626883457,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.5233,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.9605202807866595,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.5892,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.9941106462464011,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6674,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.958307096804603,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.5351,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.8848714279388494,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6071,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.9124009475249037,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.4971,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.8170349079531632,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.4853,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.9168135684559577,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.5256,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.8958868424097945,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.595,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 1.0322136965161413,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.625,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.8686088201111917,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.5213,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 1.397943370653368,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7105,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.849284154839858,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.471,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9651700194122222,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5713,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.8461510681085123,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.526,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.8872125095643726,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5451,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.7725367210449859,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.5168,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.9731750885863505,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.5905,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.8575890240241985,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.5402,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.9310603758600113,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.5713,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.9486072925333384,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.5536,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.8796289974349658,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.473,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.904832093813838,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.5644,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.9959967053012221,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6113,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 1.0035413750150226,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.5597,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.7522128103248988,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.3973,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.9559089106196158,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6254,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.9082540088018486,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.5922,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.9209653753383338,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.5191,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.9753363807773336,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6246,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.9128765043266641,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.5684,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.9795183066919234,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.627,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.9370464016613507,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.5785,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.8661804985123592,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.4774,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.9275349767601161,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.5361,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.9776163224048302,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6501,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.9684150776770701,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6016,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.8536286255083416,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.4827,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8488705178074374,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.5068,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.9653390506845506,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.5196,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.8436991002384736,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.4656,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.8749823412974033,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.5662,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.8985882406835242,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.5482,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.9630164324990867,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.5153,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.7646616715144958,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.4989,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.9902478452601741,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.586,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.7945357744020473,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.49,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.9437580839050794,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.5464,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.9000794532132054,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.5479,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.8248876467754059,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.4422,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 1.0134703940851104,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.5729,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.8799946140730117,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.5678,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.8065366798841783,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.4753,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.9475335105937116,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6077,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.7754114695147104,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.4338,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.8374808007484986,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.4497,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.8323538409441401,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.5003,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.8992981567160874,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.4931,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.9523785259083702,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.545,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.9028532965536356,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.5643,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.8329661181095044,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.5065,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.9045489579885994,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.5754,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.8122236201667463,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.4596,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.9458700607662385,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.5422,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.9400045938705994,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.5691,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.950119666470634,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6084,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.8266852543982652,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.5058,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.9171482494675368,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.5265,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.9129964408049166,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.4856,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.970371319237002,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.5786,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.938108920869477,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.5396,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.8324834459051568,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5321,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.9720401613921223,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6282,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.9211477035330666,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.5049,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.9114536236330752,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.5723,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.869206743166981,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.5251,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.9131960209724165,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.5253,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 1.0325033567001431,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5184,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 1.0711695560219818,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.4953,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.79993043009904,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.4594,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.9076938969046555,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.5813,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 1.1954518554339275,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6984,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.9553192337296545,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.645,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.8824602104943458,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.5727,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 1.1307467443908437,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.7457,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.8714489356501227,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.5797,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.8727769203474613,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.5305,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 1.012849393772381,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6181,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.9295677357269199,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.5982,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.96562031519255,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.4983,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.8730122804461445,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.5385,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.9789972079391175,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.5423,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 1.010970133723809,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.5722,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.9170646823952904,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.5465,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.9973204900340932,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6262,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.8259919276621404,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.4667,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.8937375710756645,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.5263,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.8914255511382069,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.5328,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.964531191188055,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.6234,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 1.016152490851934,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5976,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.9408720151506826,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.5926,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.8491116729945938,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.5053,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.8801389581367419,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.4839,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.9189757810625665,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5776,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.8679856769713041,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.5109,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.8653361527703157,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.4714,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 1.2154684681960608,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.5409,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.8117989635671922,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.4881,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.8933660238435036,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.4991,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.8805551533380775,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.556,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.9076112378541764,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.5162,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.7783752852786534,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.4355,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.9681238878174612,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6387,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.986067456670194,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6401,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 1.1616887354158458,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.6503,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.8356171170071415,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.46,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.8234938113671183,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.4529,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.9021931268838739,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.619,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.8805150998519309,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.5648,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.8766414205095304,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.4507,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.8963944790833713,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.5129,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.9674164869308384,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6082,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.9387142620377266,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.5818,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 1.0476174430955054,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5424,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.778305736612384,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.4626,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.8066049464477373,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.484,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.7361159517063306,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.4208,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.7861135477198569,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.4816,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 1.1101606366000765,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.8143,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.8571834353244482,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.4639,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.8812710706618156,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.5343,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.9011119020017496,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5465,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.7150605871766601,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.3887,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.934933015144909,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.5632,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.814265512837061,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.513,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.8898108833293623,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.4953,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.7878727211367245,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.4097,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.8262094779399805,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.5031,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.8602059461423265,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.5065,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.9102039847676353,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.5793,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.8593913559563465,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.5173,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.9612889504391398,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.5509,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.8199863353032216,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.4669,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.8683319349314995,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.4832,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.8853586682877709,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.5853,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.870653065272409,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5657,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 1.0360095669560587,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.5844,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.9127764534167723,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.5158,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.8324285302416667,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.4904,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.9126213682993818,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.5612,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.9377897981379643,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.5324,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.9267564340290367,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.54,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.8885418872921662,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.5605,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.9220925719860372,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.5891,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.8917359520594882,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.5343,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.856374299549514,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.4947,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.8472421331309948,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.4742,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.9059799615295063,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.5649,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.903380145241749,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.5267,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.838159360823987,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.4785,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.95426650020417,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6238,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.9835303491327146,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5337,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.9085320670809434,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.5523,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.9402108229444758,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5777,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.8033901405681682,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.4331,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.8483623278184083,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.4641,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.8382782289402538,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.5495,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 1.0653563377438948,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6996,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.8526435347765418,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.4201,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.9173974064653941,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.5011,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.9118765944555586,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.5349,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.8523726711246717,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.4843,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.8310938443735266,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.4752,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8160240253137404,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.4935,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.9133439238402501,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.5186,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 1.2257899135753538,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7849,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.8941487901942106,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.511,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.8927617816033743,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.5277,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.8569616628261307,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.4934,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.8623307013532433,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.4872,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.8339219712284653,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.4717,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.8919463963392258,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.5271,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 1.038351660842284,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.5991,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.9113680210072598,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5128,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.9405512171624507,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.581,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 1.0139304340195732,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5738,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.8329899919177766,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.4126,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.8290302795588016,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.524,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.831940504501025,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.5042,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.7654111092715857,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.3579,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.8185108283565805,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.5127,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 1.2175561725900603,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.4866,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.9256230720577836,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.575,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.8594777460193576,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.5353,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 1.0304680086366294,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.551,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.9025342564934193,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.4693,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.9341684848617188,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.4824,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.9461860263023557,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5283,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 1.2350611078400762,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.5681,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.7474778193812881,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.3756,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.9464908649387499,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6277,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.9416513453191021,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.4862,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.8908592829855512,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.4729,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.8856623091540741,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.4909,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.9979093631514518,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.6267,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.9396388164146111,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.4944,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.9349642997890655,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.5823,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.9302060251593333,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.5565,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 1.0068653034108261,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6292,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.8308900215815783,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.558,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.8291497760442449,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.4724,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.837140049991436,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.4343,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.8206278559411827,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.522,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.768654648097681,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.3869,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.9008291992190718,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.4763,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 1.004138999783533,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.5653,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.8866055795866293,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.5081,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.8712416614852868,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5173,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.8352905774940463,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.5085,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.8664854906477405,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5379,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.8545854637602966,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.4901,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.8792443966755817,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.4969,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.8275160725349503,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.5005,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.9555667842829724,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.5709,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.8893416025085475,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.5108,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.9198927290446861,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.4771,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.9765388094416342,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.5807,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.9779341822780092,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6186,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 1.0534004761522051,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.5374,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.845348850963641,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.4122,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 1.104942550352276,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.4983,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 1.0475255602464242,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.5859,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.9522886093773041,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.4994,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.7025830954078448,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.3753,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.8969366965782625,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.4291,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.9794017411293363,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.5851,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.8854550965461073,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.5149,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.823691828844835,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.4734,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.7818438263006319,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.4146,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.9426024371050443,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.4801,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.8749080560869444,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.4793,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.8676696472413886,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.4645,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.8960028684930892,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.4813,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.841782397347136,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.469,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.933131225290578,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.5261,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.8684117605795351,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.4353,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 1.0568779961110095,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.5826,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.9697568087755433,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5972,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.8828340754449046,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.4705,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.9568532376050941,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5999,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.8968564158876486,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.494,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.9443142557280755,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6001,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.9579343285914728,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.5305,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.8775488211919319,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.507,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.9473459034568387,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.5673,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.8299701680297237,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.4673,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.868648290322588,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.5002,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.7270428842077594,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.4236,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.8074750039881929,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.4342,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.9026392457184746,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.4882,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.9283783800439218,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.4729,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 1.0460686665759698,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6621,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.9080809068007054,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.4621,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8947614683641865,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.5121,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.8655363229396491,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.4584,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.8968007468514912,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.5095,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.9905380754651123,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.5721,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.9825103431981693,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.5568,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.9732696070377225,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.5385,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.8911017162323968,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.486,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.8415838751932674,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.4727,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.9298508591052215,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.4488,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.951180288226257,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6119,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.9594481967706358,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.522,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.8216569948534782,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.4426,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.8764995815135124,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.4885,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.8669171775122492,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.4792,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 1.0636096340620473,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.5597,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.9666529676562569,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.5385,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.975179351024621,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.5263,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.9728368420681078,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.4715,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.8307393168249513,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.4706,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.933506651180675,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.487,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.9003621002918134,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5217,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.9934422138613694,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.5806,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 1.1353093338542166,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.5983,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 1.0046597135773063,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.5806,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.9049654952924316,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.4912,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.9797372066358172,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.5539,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.8191030872242712,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.4482,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 1.0361003460873626,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.5914,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.9631844884063604,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.5133,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 1.0839124306252106,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.5904,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.9087621324174773,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.4718,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.8537357892822648,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.4538,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.8868391059398126,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.4907,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.9849941050555328,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.536,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.8566136171382083,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5163,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.9598154308582632,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.5125,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 1.1024945081202426,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.5302,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.9417202792081554,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.5601,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.8954980066354414,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.5416,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.999982721740313,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.5495,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 1.0111231909200546,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5273,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.9274679387425223,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.5113,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.9598365132710229,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.4457,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 1.0479412201817522,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6291,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.9306949568961659,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.5439,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.856405806668651,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.4288,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.9681397336561196,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.477,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.9783902754901848,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.5747,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.7718073981075178,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.3934,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.9365851525522183,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.5427,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8946986907015603,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.4729,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.9712126805966191,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.5563,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.9914126134977888,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5616,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.9915353472321419,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.6006,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 1.021875288392754,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.5085,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.9575275995276004,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.5485,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.9815525486849324,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.524,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.8472755247959326,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.5045,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.9110161852107633,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.4986,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 1.1735288736114748,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.7224,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.987447662445318,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.4973,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 1.0488609031326457,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.4996,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 1.014047500109056,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.5138,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.9588783339157702,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.5101,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 1.035740452465745,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.5993,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 1.1102270687547424,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6304,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.7831216938523164,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.4387,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.9117675424261213,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.5163,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 1.1329774437267308,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7174,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 1.067893713503416,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.6084,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.9006368268680338,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.5008,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.9708569171881641,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.5353,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.8938517118108713,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.4912,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.9424813688217818,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.5371,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.9506715977030524,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5418,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.0195314451194224,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.5502,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.9360909013448883,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.4926,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.8925123180992532,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.5075,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.9373120885129952,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.5522,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.9045565462595742,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.4455,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.9496806607012137,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.4764,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 1.2070971555248793,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.5305,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.9884453096239335,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5564,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.925262765183813,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.5431,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.8650475451010552,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.4824,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.9375027024745487,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.5334,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 1.0258138994156334,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.566,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 1.1348295457470585,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6353,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.8099381398300214,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.5236,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 1.1335763733398434,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6208,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.8417059804666327,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.4808,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.8740279745856687,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.4519,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.9025682701839401,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.4474,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.9356977884254927,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.4785,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.7786241335150642,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.4236,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.904185452995062,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.5729,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 1.01166300380065,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.42,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.8484776609022155,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.4598,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.8555465620019524,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.515,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.9853659491986522,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.5119,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.8368195146029167,
+      "learning_rate": 0.0,
+      "loss": 0.4741,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 165935961923584.0,
+      "train_loss": 0.6441711540222168,
+      "train_runtime": 10384.4138,
+      "train_samples_per_second": 1.926,
+      "train_steps_per_second": 0.12
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 165935961923584.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca2de9b5033ee8244cdadf276aafa1f753a8b6f5
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3845c8b530bfcf80fe75196425957db77bed50fc
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfb0e40d889acd8f4b66ac3dd43335936c109f7b563dc264e53aa62df3f03b4e
+size 671150064
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aab918516733b2284e4bb7680bb2a5ae6091d811
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80fd6743c026c9eca2d229e104ed0a0e0541174078c4f04e1cd24aeeb01bb209
+size 918507402
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..364f874fe4df900e5b5711dbb0efc161f99683ac
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 4.619897607575414,
+      "learning_rate": 5e-05,
+      "loss": 1.4867,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 4.886720880463927,
+      "learning_rate": 0.0001,
+      "loss": 1.5103,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 2.0826440735967027,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3063,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.7883162219744941,
+      "learning_rate": 0.0002,
+      "loss": 1.0129,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.424535127131405,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.0793,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 2.6477025684213955,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 0.9522,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 2.383868971479716,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.0021,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.6100909214247756,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.8346,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.3081346305473396,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.8299,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.7441102778418505,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.0063,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.2701284480164619,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8531,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.736304860214383,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8597,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 1.4341392622612212,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.7976,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.3844496084613491,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.9177,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.4694767893133263,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.885,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 1.1939356484363937,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.873,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.4401907008611985,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.9079,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 1.3988894591269279,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9131,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 1.2454329527509385,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8773,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.255418146967074,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8735,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 1.1560074136431318,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7798,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.291681536643461,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8684,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.3674424850401692,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.7092,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.2999036173310123,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.7637,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.2391085483231103,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.7795,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 1.5082519625603912,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.8217,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 1.3850556089113835,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.818,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.1935139075969265,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.7651,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 1.1508390722152042,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.8159,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.2770011363549434,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.8071,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 1.068067124408631,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.7454,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 1.3051298462722027,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8419,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.9798996910610092,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.7307,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 1.142833058756903,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.7414,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.145518711497527,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8124,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 1.0614283304633985,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.7862,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 1.094667484989183,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.7924,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 1.1491934862922923,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8699,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 1.1268283363130298,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7633,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.043542678445131,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.7574,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 1.092441023312135,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7096,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 1.1601589532727679,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8354,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 1.1353600206932608,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.7407,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 1.0503363091620095,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.7131,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.1275262381443367,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.7309,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 1.0900400445912137,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7705,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 1.0836608138977326,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.7584,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 1.0680998853596113,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.7439,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 1.0990957659179,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.7257,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.0225040423604024,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.6854,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 1.1031078059651571,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.7601,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 1.0970372956505092,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7407,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 1.2912652706234626,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.8306,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 1.0303770662924296,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7283,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.1838484089311536,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.7869,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 1.1996508231205614,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8304,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.9973023772472317,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.7435,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 1.1394855232486916,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8035,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 1.0023205132345805,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.6697,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9649329991466065,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.7329,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.9120142833559911,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.6681,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 1.0448187897096388,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.7906,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.9139072472140527,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.6626,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 1.1574772800929347,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8664,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.0220362084048196,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7479,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 1.199094525413627,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8719,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 1.000008577603346,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7416,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 1.060537149329608,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.7707,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 1.1371266027976432,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.769,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.1350763178425805,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7817,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 1.0218141814745094,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.737,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.9202080674794947,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.6632,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.9296699980814602,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.6548,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 1.0335725615091327,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.6948,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.141947553762206,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7382,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 1.4100203818688208,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8474,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 1.0163008682743984,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.6651,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 1.221529976174952,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.8131,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.9618984836374378,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.65,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.0996286151580774,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8285,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.9268846013697055,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.6435,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 1.027211457290476,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.745,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 1.082529517207295,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.7019,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 1.0135468128062057,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.7226,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.97001184986676,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.8256,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.9740270101491069,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.6412,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 1.0372568205011428,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.6513,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 1.1221621988691768,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.6919,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 1.0375063434620733,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.7526,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.0174249016341808,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.6827,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.9085677543812799,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.6304,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.9208505244485531,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.7146,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 1.0697826972171396,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.9217,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.968128643452508,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.6088,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.0269110861575053,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7421,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 1.0747124471237177,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7495,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 1.023035458249502,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7403,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.8812897998547033,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7222,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 1.0027482226792357,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.6957,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.0890324102494235,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.758,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 1.1577158487469124,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.6337,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.9448443729492736,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.6452,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.9259142261290009,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.6566,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 1.0542917132855005,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7358,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.091275350594823,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.693,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.9786531655966347,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.6642,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 1.1278517232346554,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7668,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 1.0005845602335413,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7357,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.8841528934669991,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.5717,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.3521091334370579,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.6707,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.8846307307259047,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.5618,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 1.0703643421078397,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7374,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.9746740913726185,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.6961,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.9414487770868957,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7156,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.9304329120448038,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.6689,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 1.0025867854859485,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.6602,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.9348458350132222,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.6133,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 1.0271198817911413,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.6659,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 1.048864398380103,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.6276,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.101718976429846,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8493,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.9521334208542822,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.6474,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.9276337231350391,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.6515,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.9611591344997523,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.6451,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.8927926010295639,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7052,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9124525841463956,
+      "learning_rate": 0.0,
+      "loss": 0.5924,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 16496717570048.0,
+      "train_loss": 0.7736291184425353,
+      "train_runtime": 1040.5743,
+      "train_samples_per_second": 1.922,
+      "train_steps_per_second": 0.12
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 16496717570048.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..347647564608ef24ccb20498950dac46c08d01cf
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..702fc23be7e5a1d7e9d91980a80968c974b1cff0
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35f458620f5a287a03c955f6acb741a079e64971521b1dcbaf8c8a942f08430
+size 671150064
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b0ea064a30190425bfde06794dccea4671c42329
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cf0e54bb1000e80eb51dae2ebfed904d13a5dd56cdef499b7980b2999dd6532
+size 918507402
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a06b06d0464b81930b9129812fe5d7b88f0152bb
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/trainer_state.json
@@ -0,0 +1,1792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 4.674744838529102,
+      "learning_rate": 2.5e-05,
+      "loss": 1.4867,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 4.9256407021066675,
+      "learning_rate": 5e-05,
+      "loss": 1.5103,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 2.5677944230432193,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 1.4188,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 2.2529029118150716,
+      "learning_rate": 0.0001,
+      "loss": 1.1377,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.047964065232966,
+      "learning_rate": 0.000125,
+      "loss": 1.0446,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.7555120313603354,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9269,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 2.134531403067591,
+      "learning_rate": 0.000175,
+      "loss": 1.0221,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.665748281051102,
+      "learning_rate": 0.0002,
+      "loss": 0.8459,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 1.4759575846757216,
+      "learning_rate": 0.0001999915737775817,
+      "loss": 0.7934,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.6304033303566767,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 0.9748,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 1.3110558409884459,
+      "learning_rate": 0.00019992417251814282,
+      "loss": 0.8587,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.7201492275174837,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 0.847,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 1.5758552031912056,
+      "learning_rate": 0.0001997894154323911,
+      "loss": 0.7981,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.433500573442223,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.9366,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.4217505783095623,
+      "learning_rate": 0.0001995873933559535,
+      "loss": 0.8733,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 1.2003920444262448,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.8634,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.6665877835121163,
+      "learning_rate": 0.0001993182424657285,
+      "loss": 0.9127,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 1.341015054535535,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.91,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 1.2273257645062086,
+      "learning_rate": 0.0001989821441880933,
+      "loss": 0.8859,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.2894468326209563,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.869,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 1.1469297491500698,
+      "learning_rate": 0.0001985793250766098,
+      "loss": 0.7831,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.2871033572214328,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8706,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 1.470258548075832,
+      "learning_rate": 0.00019811005665931205,
+      "loss": 0.7019,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.2810881750965715,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.7606,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.4072344720956307,
+      "learning_rate": 0.0001975746552556772,
+      "loss": 0.7905,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 1.7180156821685193,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.8396,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 1.367099869066617,
+      "learning_rate": 0.0001969734817634044,
+      "loss": 0.8305,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.2888961724874664,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.7905,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 1.1765669030550359,
+      "learning_rate": 0.00019630694141514464,
+      "loss": 0.8319,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.1841530286472177,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8116,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 1.0679725826738797,
+      "learning_rate": 0.0001955754835053459,
+      "loss": 0.7553,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 1.2219448462839118,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.8355,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.9711904551107217,
+      "learning_rate": 0.0001947796010873974,
+      "loss": 0.7263,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 1.1075790710725923,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.7483,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.278006549263634,
+      "learning_rate": 0.0001939198306412775,
+      "loss": 0.8164,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 1.1536165558422753,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.7936,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 1.1865654471702436,
+      "learning_rate": 0.0001929967517119289,
+      "loss": 0.8048,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 1.2343222074360047,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8808,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 1.247526068607731,
+      "learning_rate": 0.0001920109865186052,
+      "loss": 0.7913,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.1852742286804527,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.7706,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 1.1325327330263366,
+      "learning_rate": 0.00019096319953545185,
+      "loss": 0.7215,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 1.3582324850496053,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.8489,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 1.2041545764445587,
+      "learning_rate": 0.00018985409704360456,
+      "loss": 0.7603,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 1.038114948366513,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.7286,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.1182206003231439,
+      "learning_rate": 0.00018868442665510678,
+      "loss": 0.7306,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 1.0303890087751462,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.7666,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 1.1911443097100909,
+      "learning_rate": 0.00018745497680896722,
+      "loss": 0.7796,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 1.2016413454646881,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.7697,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 1.1822272055096672,
+      "learning_rate": 0.0001861665762396974,
+      "loss": 0.736,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.0983349187588152,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.6999,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 1.845199079764097,
+      "learning_rate": 0.00018482009341868697,
+      "loss": 0.8069,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 1.276303709369214,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.7438,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 1.3944781099073964,
+      "learning_rate": 0.00018341643596879367,
+      "loss": 0.8595,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 1.0632222354851693,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.7425,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.2263074444516069,
+      "learning_rate": 0.00018195655005254273,
+      "loss": 0.8127,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 1.2149536699256438,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8457,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 1.0138171981919357,
+      "learning_rate": 0.00018044141973434758,
+      "loss": 0.768,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 1.2061520303740656,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.8194,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 1.0823588512569589,
+      "learning_rate": 0.00017887206631718203,
+      "loss": 0.6896,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9904390215800949,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7558,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 1.0142633742867952,
+      "learning_rate": 0.00017724954765415137,
+      "loss": 0.7038,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 1.1916449607132265,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8259,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 1.0649736202924283,
+      "learning_rate": 0.00017557495743542585,
+      "loss": 0.7195,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 1.2212720973523508,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8927,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.1324306359339695,
+      "learning_rate": 0.00017384942445101772,
+      "loss": 0.774,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 1.2347954497560891,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.9011,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 1.0730716533928852,
+      "learning_rate": 0.00017207411182989832,
+      "loss": 0.7699,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 1.3567009841762714,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8084,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 1.222724537355913,
+      "learning_rate": 0.00017025021625596853,
+      "loss": 0.8189,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.3196168838771813,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8392,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 1.1216400372050441,
+      "learning_rate": 0.0001683789671614107,
+      "loss": 0.78,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.9974595631972965,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.6931,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 1.043035145267162,
+      "learning_rate": 0.00016646162589796615,
+      "loss": 0.7029,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 1.1202125958554328,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.7526,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.124759105227289,
+      "learning_rate": 0.00016449948488669639,
+      "loss": 0.8025,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 1.1386547403182927,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8874,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.9988928534548326,
+      "learning_rate": 0.00016249386674680184,
+      "loss": 0.7016,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 1.2940115313890033,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.8591,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 1.0315854341464359,
+      "learning_rate": 0.00016044612340408466,
+      "loss": 0.6896,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.109559558392284,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8844,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 1.115080148205073,
+      "learning_rate": 0.00015835763517965673,
+      "loss": 0.6882,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 1.1027828326759102,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.8085,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 1.1463616078622434,
+      "learning_rate": 0.0001562298098595078,
+      "loss": 0.7364,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 1.1244532000255234,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.781,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.011475799895498,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 0.8474,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 1.0240992575029322,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.7006,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 1.0456985935846164,
+      "learning_rate": 0.00015186191068884775,
+      "loss": 0.7034,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 1.038941847745798,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.7024,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 1.0874554195537363,
+      "learning_rate": 0.00014962478110547918,
+      "loss": 0.7905,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.0707901639706023,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.7269,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.9715185980838664,
+      "learning_rate": 0.0001473542009760343,
+      "loss": 0.6711,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.9913081098330498,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7484,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 1.1232772391486232,
+      "learning_rate": 0.0001450517008290827,
+      "loss": 0.9143,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 1.0817504121616406,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.6649,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.0723926535058825,
+      "learning_rate": 0.00014271883270950073,
+      "loss": 0.7912,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 1.0967417138149718,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.7739,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 1.1162816331090357,
+      "learning_rate": 0.00014035716913228568,
+      "loss": 0.7969,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 1.0409425412997686,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.7703,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 1.1944482073063658,
+      "learning_rate": 0.0001379683020225714,
+      "loss": 0.7718,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.1102093067375531,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8045,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 1.2340044538914852,
+      "learning_rate": 0.00013555384164256048,
+      "loss": 0.6847,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 1.0344458272008699,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.6817,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 1.0626120506094445,
+      "learning_rate": 0.00013311541550609565,
+      "loss": 0.7149,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 1.1218393616590492,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7871,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.1197044459523418,
+      "learning_rate": 0.00013065466728160252,
+      "loss": 0.7283,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 1.0062988279593645,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.7224,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 1.1044521804118692,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 0.8117,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 1.0391694322668483,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7874,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.9011177202476485,
+      "learning_rate": 0.00012567285335732633,
+      "loss": 0.6216,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.033983706569211,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.7362,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 1.0371409551989839,
+      "learning_rate": 0.00012315514574583113,
+      "loss": 0.6164,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 1.1031708450635234,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7865,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 1.0278240722304612,
+      "learning_rate": 0.00012062182995929882,
+      "loss": 0.7084,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 1.0344356319609227,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.7945,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.9526235778560432,
+      "learning_rate": 0.0001180746136283638,
+      "loss": 0.7011,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.9870993105336784,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.689,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 1.0630454299198857,
+      "learning_rate": 0.00011551521375359206,
+      "loss": 0.6496,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 1.0109882559812657,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.6951,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 1.1715016413417012,
+      "learning_rate": 0.00011294535554810354,
+      "loss": 0.6781,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.2052720292878927,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8652,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.9966996386903084,
+      "learning_rate": 0.00011036677127465889,
+      "loss": 0.6521,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.9951818796352864,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7029,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.9368117058127309,
+      "learning_rate": 0.00010778119907799398,
+      "loss": 0.6698,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.9318100336553307,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.7318,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.0983885346285733,
+      "learning_rate": 0.00010519038181318999,
+      "loss": 0.6213,
+      "step": 125
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.7638747532597451,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.3842,
+      "step": 126
+    },
+    {
+      "epoch": 1.016,
+      "grad_norm": 0.7793981616830645,
+      "learning_rate": 0.00010259606587086783,
+      "loss": 0.4023,
+      "step": 127
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.6663909865377676,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.3208,
+      "step": 128
+    },
+    {
+      "epoch": 1.032,
+      "grad_norm": 0.8427465859741629,
+      "learning_rate": 0.0001,
+      "loss": 0.3975,
+      "step": 129
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.7866238132594401,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.3673,
+      "step": 130
+    },
+    {
+      "epoch": 1.048,
+      "grad_norm": 0.8616426327296077,
+      "learning_rate": 9.740393412913219e-05,
+      "loss": 0.3719,
+      "step": 131
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.9456553559591211,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.3657,
+      "step": 132
+    },
+    {
+      "epoch": 1.064,
+      "grad_norm": 0.9086242195831249,
+      "learning_rate": 9.480961818681004e-05,
+      "loss": 0.3225,
+      "step": 133
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 1.0747694362727902,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.3502,
+      "step": 134
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 1.0596031466336995,
+      "learning_rate": 9.221880092200601e-05,
+      "loss": 0.3133,
+      "step": 135
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 1.1388934802519084,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.3266,
+      "step": 136
+    },
+    {
+      "epoch": 1.096,
+      "grad_norm": 1.1907787848999825,
+      "learning_rate": 8.963322872534114e-05,
+      "loss": 0.38,
+      "step": 137
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 1.1833920892982426,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.3654,
+      "step": 138
+    },
+    {
+      "epoch": 1.112,
+      "grad_norm": 0.932565927788482,
+      "learning_rate": 8.705464445189647e-05,
+      "loss": 0.3348,
+      "step": 139
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 1.056948343348815,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.3325,
+      "step": 140
+    },
+    {
+      "epoch": 1.1280000000000001,
+      "grad_norm": 1.1116858198987372,
+      "learning_rate": 8.448478624640797e-05,
+      "loss": 0.3399,
+      "step": 141
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.9487153714016062,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.2648,
+      "step": 142
+    },
+    {
+      "epoch": 1.144,
+      "grad_norm": 0.987507461623801,
+      "learning_rate": 8.192538637163621e-05,
+      "loss": 0.2793,
+      "step": 143
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 1.0489433838473108,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.3665,
+      "step": 144
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.0030902972919105,
+      "learning_rate": 7.93781700407012e-05,
+      "loss": 0.3552,
+      "step": 145
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.8904106592566092,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.2711,
+      "step": 146
+    },
+    {
+      "epoch": 1.176,
+      "grad_norm": 0.9706926884829528,
+      "learning_rate": 7.684485425416888e-05,
+      "loss": 0.3373,
+      "step": 147
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.9389335945233177,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.3055,
+      "step": 148
+    },
+    {
+      "epoch": 1.192,
+      "grad_norm": 0.9872080557866764,
+      "learning_rate": 7.432714664267373e-05,
+      "loss": 0.3001,
+      "step": 149
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.9094267656601626,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.2747,
+      "step": 150
+    },
+    {
+      "epoch": 1.208,
+      "grad_norm": 1.5104293946560028,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 0.4515,
+      "step": 151
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.9503009794649055,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.3233,
+      "step": 152
+    },
+    {
+      "epoch": 1.224,
+      "grad_norm": 0.8142713969471524,
+      "learning_rate": 6.934533271839752e-05,
+      "loss": 0.2838,
+      "step": 153
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.8935891149236612,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.267,
+      "step": 154
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.2671981198957942,
+      "learning_rate": 6.688458449390437e-05,
+      "loss": 0.3911,
+      "step": 155
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.9857137845834811,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.3482,
+      "step": 156
+    },
+    {
+      "epoch": 1.256,
+      "grad_norm": 0.8992426112369553,
+      "learning_rate": 6.444615835743955e-05,
+      "loss": 0.276,
+      "step": 157
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.8690897102819473,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.2812,
+      "step": 158
+    },
+    {
+      "epoch": 1.272,
+      "grad_norm": 1.0592691387703361,
+      "learning_rate": 6.203169797742861e-05,
+      "loss": 0.3428,
+      "step": 159
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.1066763115172018,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.3572,
+      "step": 160
+    },
+    {
+      "epoch": 1.288,
+      "grad_norm": 1.01859697464143,
+      "learning_rate": 5.964283086771435e-05,
+      "loss": 0.3142,
+      "step": 161
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 1.1019021725049405,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.4041,
+      "step": 162
+    },
+    {
+      "epoch": 1.304,
+      "grad_norm": 0.9615876490141757,
+      "learning_rate": 5.728116729049928e-05,
+      "loss": 0.3092,
+      "step": 163
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.9397205153627258,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.3022,
+      "step": 164
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 1.1842470031836703,
+      "learning_rate": 5.4948299170917325e-05,
+      "loss": 0.3963,
+      "step": 165
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 1.029661373530674,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.33,
+      "step": 166
+    },
+    {
+      "epoch": 1.336,
+      "grad_norm": 0.8341974846050139,
+      "learning_rate": 5.26457990239657e-05,
+      "loss": 0.2993,
+      "step": 167
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.9528163807665674,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.331,
+      "step": 168
+    },
+    {
+      "epoch": 1.3519999999999999,
+      "grad_norm": 0.8127656540497246,
+      "learning_rate": 5.0375218894520834e-05,
+      "loss": 0.27,
+      "step": 169
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.815368869193228,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.3123,
+      "step": 170
+    },
+    {
+      "epoch": 1.3679999999999999,
+      "grad_norm": 0.9341793171721062,
+      "learning_rate": 4.813808931115228e-05,
+      "loss": 0.3241,
+      "step": 171
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.8406613326246486,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.2474,
+      "step": 172
+    },
+    {
+      "epoch": 1.384,
+      "grad_norm": 0.9014918983900241,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.2941,
+      "step": 173
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 1.052110080300373,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.3315,
+      "step": 174
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.943872054660999,
+      "learning_rate": 4.377019014049223e-05,
+      "loss": 0.3199,
+      "step": 175
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 1.0067114411249958,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.3588,
+      "step": 176
+    },
+    {
+      "epoch": 1.416,
+      "grad_norm": 0.8629665177436001,
+      "learning_rate": 4.164236482034327e-05,
+      "loss": 0.2606,
+      "step": 177
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.8343441015042404,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.255,
+      "step": 178
+    },
+    {
+      "epoch": 1.432,
+      "grad_norm": 1.048342053248487,
+      "learning_rate": 3.9553876595915375e-05,
+      "loss": 0.3247,
+      "step": 179
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9714346890332698,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.2662,
+      "step": 180
+    },
+    {
+      "epoch": 1.448,
+      "grad_norm": 0.9744546187297268,
+      "learning_rate": 3.750613325319817e-05,
+      "loss": 0.2609,
+      "step": 181
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.992170075556931,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.2967,
+      "step": 182
+    },
+    {
+      "epoch": 1.464,
+      "grad_norm": 0.9207170612761674,
+      "learning_rate": 3.550051511330361e-05,
+      "loss": 0.2764,
+      "step": 183
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.987847549650391,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.2718,
+      "step": 184
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.9576257344728829,
+      "learning_rate": 3.3538374102033866e-05,
+      "loss": 0.3137,
+      "step": 185
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.8814189665237778,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.2281,
+      "step": 186
+    },
+    {
+      "epoch": 1.496,
+      "grad_norm": 0.9917324983590853,
+      "learning_rate": 3.1621032838589305e-05,
+      "loss": 0.2718,
+      "step": 187
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.9523929333349113,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.3222,
+      "step": 188
+    },
+    {
+      "epoch": 1.512,
+      "grad_norm": 1.3965148113166408,
+      "learning_rate": 2.974978374403147e-05,
+      "loss": 0.3282,
+      "step": 189
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 1.0212998172537535,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.3197,
+      "step": 190
+    },
+    {
+      "epoch": 1.528,
+      "grad_norm": 0.9320938412848552,
+      "learning_rate": 2.7925888170101665e-05,
+      "loss": 0.2568,
+      "step": 191
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 1.0038773999662451,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.3048,
+      "step": 192
+    },
+    {
+      "epoch": 1.544,
+      "grad_norm": 1.067071275673017,
+      "learning_rate": 2.6150575548982292e-05,
+      "loss": 0.2752,
+      "step": 193
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 1.106070124000284,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.3095,
+      "step": 194
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.011696313071652,
+      "learning_rate": 2.4425042564574184e-05,
+      "loss": 0.2762,
+      "step": 195
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 0.9236563136547,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.2892,
+      "step": 196
+    },
+    {
+      "epoch": 1.576,
+      "grad_norm": 1.0255063848944046,
+      "learning_rate": 2.2750452345848682e-05,
+      "loss": 0.3077,
+      "step": 197
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 0.9782082431449886,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.2852,
+      "step": 198
+    },
+    {
+      "epoch": 1.592,
+      "grad_norm": 0.8326643844084437,
+      "learning_rate": 2.112793368281799e-05,
+      "loss": 0.2342,
+      "step": 199
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.5900666578407399,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.3175,
+      "step": 200
+    },
+    {
+      "epoch": 1.608,
+      "grad_norm": 1.0190552209754948,
+      "learning_rate": 1.9558580265652448e-05,
+      "loss": 0.3258,
+      "step": 201
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 0.8932354211021565,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.2457,
+      "step": 202
+    },
+    {
+      "epoch": 1.624,
+      "grad_norm": 1.0784167744092246,
+      "learning_rate": 1.804344994745727e-05,
+      "loss": 0.3324,
+      "step": 203
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 1.2975319869241422,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.3326,
+      "step": 204
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 1.0722622793694678,
+      "learning_rate": 1.6583564031206357e-05,
+      "loss": 0.2942,
+      "step": 205
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 0.9268908762977555,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.2885,
+      "step": 206
+    },
+    {
+      "epoch": 1.6560000000000001,
+      "grad_norm": 0.975540228005638,
+      "learning_rate": 1.5179906581313064e-05,
+      "loss": 0.3475,
+      "step": 207
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 1.2947356002550057,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.3367,
+      "step": 208
+    },
+    {
+      "epoch": 1.6720000000000002,
+      "grad_norm": 0.9969017308710312,
+      "learning_rate": 1.3833423760302611e-05,
+      "loss": 0.2799,
+      "step": 209
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 1.105832602470686,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.3545,
+      "step": 210
+    },
+    {
+      "epoch": 1.688,
+      "grad_norm": 1.008270175667529,
+      "learning_rate": 1.2545023191032801e-05,
+      "loss": 0.3062,
+      "step": 211
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 1.047066942797903,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.3305,
+      "step": 212
+    },
+    {
+      "epoch": 1.704,
+      "grad_norm": 1.084646274515253,
+      "learning_rate": 1.131557334489326e-05,
+      "loss": 0.3136,
+      "step": 213
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 0.9947790374688665,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.3364,
+      "step": 214
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.8940746388885817,
+      "learning_rate": 1.0145902956395447e-05,
+      "loss": 0.3075,
+      "step": 215
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 0.8710247100820544,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.2787,
+      "step": 216
+    },
+    {
+      "epoch": 1.736,
+      "grad_norm": 0.8808640303308677,
+      "learning_rate": 9.036800464548157e-06,
+      "loss": 0.2776,
+      "step": 217
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 0.9007324930455055,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.2931,
+      "step": 218
+    },
+    {
+      "epoch": 1.752,
+      "grad_norm": 1.3360245358851073,
+      "learning_rate": 7.989013481394814e-06,
+      "loss": 0.3635,
+      "step": 219
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.8662064286590857,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.2964,
+      "step": 220
+    },
+    {
+      "epoch": 1.768,
+      "grad_norm": 1.0915529365558145,
+      "learning_rate": 7.003248288071118e-06,
+      "loss": 0.3648,
+      "step": 221
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 0.9202789725816855,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.2857,
+      "step": 222
+    },
+    {
+      "epoch": 1.784,
+      "grad_norm": 0.9318116552878593,
+      "learning_rate": 6.08016935872251e-06,
+      "loss": 0.2843,
+      "step": 223
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 0.8494920756416293,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.2712,
+      "step": 224
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.8797892336415777,
+      "learning_rate": 5.22039891260262e-06,
+      "loss": 0.2812,
+      "step": 225
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 0.8760823018514744,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.2645,
+      "step": 226
+    },
+    {
+      "epoch": 1.8159999999999998,
+      "grad_norm": 0.9572331290852502,
+      "learning_rate": 4.424516494654118e-06,
+      "loss": 0.2643,
+      "step": 227
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 1.0584565777951243,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.2744,
+      "step": 228
+    },
+    {
+      "epoch": 1.8319999999999999,
+      "grad_norm": 0.9829527904030954,
+      "learning_rate": 3.693058584855369e-06,
+      "loss": 0.3071,
+      "step": 229
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 1.0086221717790416,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.3151,
+      "step": 230
+    },
+    {
+      "epoch": 1.8479999999999999,
+      "grad_norm": 0.9037186453744863,
+      "learning_rate": 3.026518236595621e-06,
+      "loss": 0.2567,
+      "step": 231
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 1.0019997239355016,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.3082,
+      "step": 232
+    },
+    {
+      "epoch": 1.8639999999999999,
+      "grad_norm": 0.882894828622119,
+      "learning_rate": 2.4253447443228106e-06,
+      "loss": 0.2677,
+      "step": 233
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 0.8996565449715206,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.2796,
+      "step": 234
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.9089579248416909,
+      "learning_rate": 1.8899433406879608e-06,
+      "loss": 0.2515,
+      "step": 235
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 0.9685412253870014,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.2778,
+      "step": 236
+    },
+    {
+      "epoch": 1.896,
+      "grad_norm": 0.8996738035578177,
+      "learning_rate": 1.4206749233902084e-06,
+      "loss": 0.2506,
+      "step": 237
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 0.9475958329418473,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.2694,
+      "step": 238
+    },
+    {
+      "epoch": 1.912,
+      "grad_norm": 0.8885136392291223,
+      "learning_rate": 1.0178558119067315e-06,
+      "loss": 0.3436,
+      "step": 239
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.8280334345400172,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.2436,
+      "step": 240
+    },
+    {
+      "epoch": 1.928,
+      "grad_norm": 1.1115438568448008,
+      "learning_rate": 6.817575342714988e-07,
+      "loss": 0.2731,
+      "step": 241
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 0.8211660429326406,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.2471,
+      "step": 242
+    },
+    {
+      "epoch": 1.944,
+      "grad_norm": 0.8498742119952564,
+      "learning_rate": 4.126066440464982e-07,
+      "loss": 0.2561,
+      "step": 243
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 1.0168713887533058,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.2906,
+      "step": 244
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.9691850112649262,
+      "learning_rate": 2.1058456760891798e-07,
+      "loss": 0.3031,
+      "step": 245
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 0.9259385068622227,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.2394,
+      "step": 246
+    },
+    {
+      "epoch": 1.976,
+      "grad_norm": 0.9391835901797269,
+      "learning_rate": 7.582748185719358e-08,
+      "loss": 0.2998,
+      "step": 247
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 1.0287335651544172,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.2866,
+      "step": 248
+    },
+    {
+      "epoch": 1.992,
+      "grad_norm": 1.0000330802426152,
+      "learning_rate": 8.426222418311814e-09,
+      "loss": 0.3155,
+      "step": 249
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.0246789859149088,
+      "learning_rate": 0.0,
+      "loss": 0.3072,
+      "step": 250
+    },
+    {
+      "epoch": 2.0,
+      "step": 250,
+      "total_flos": 32940892651520.0,
+      "train_loss": 0.5541693149805069,
+      "train_runtime": 2079.3998,
+      "train_samples_per_second": 1.924,
+      "train_steps_per_second": 0.12
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 32940892651520.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a88111f02050b3f7366d4ed8a16f98ab5418837
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ae265615db8c3b07f89e7c4a0db61fc6a2a02412
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4375ea8b2c76c461b0cf4df969503d14eef4bc3ada7e63ff213d15486fda9e6e
+size 671150064
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508fe400ff01b922df556ef0dc69306574677869
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a1255eca8de6f9224f95a6ba5909cfd0ebefaf1d5ab548019e86fb7d0f3121b
+size 918507402
diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..64ba627bbd9831927d2eb27f733c50e194d890e9
--- /dev/null
+++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 3.8494427382984977,
+      "learning_rate": 2e-05,
+      "loss": 1.5206,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 4.452605449564811,
+      "learning_rate": 4e-05,
+      "loss": 1.5109,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 2.2827512458595445,
+      "learning_rate": 6e-05,
+      "loss": 1.3696,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.9005110908633476,
+      "learning_rate": 8e-05,
+      "loss": 1.1741,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.4910329120321058,
+      "learning_rate": 0.0001,
+      "loss": 0.9923,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.88035086322046,
+      "learning_rate": 0.00012,
+      "loss": 0.9127,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 2.042098191915081,
+      "learning_rate": 0.00014,
+      "loss": 0.9648,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.7233770983581165,
+      "learning_rate": 0.00016,
+      "loss": 0.9461,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.449197633443149,
+      "learning_rate": 0.00018,
+      "loss": 0.836,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.2900359918295659,
+      "learning_rate": 0.0002,
+      "loss": 0.876,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.4852099226013014,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8284,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.3889814325040792,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8337,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.5064544417143808,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9325,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.3764770861409006,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.7912,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.407685959814556,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.8235,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.419892646905105,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9167,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 1.2542459367172696,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.7894,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.5053470324097624,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9018,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.4266643857980994,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8374,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.4513379128350314,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9682,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.437867162646637,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.802,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 1.2646231736967548,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.7591,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 1.2760649437008011,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8405,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 1.3244564743716525,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8056,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.3781336157488413,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.7843,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 1.2349066076612103,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8032,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 1.4471220778165244,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8885,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 1.1706339840369515,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8026,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.2842245444466585,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.7821,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.1574449208863158,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.7997,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 1.4393641501848962,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8374,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 1.1022637704133273,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.691,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 1.2390416489286467,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8228,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 1.2723065693593631,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9256,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 1.3320103811934423,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9903,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 1.1082008197953372,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.7964,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 1.198571527552164,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.788,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 1.0499439896228493,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.6915,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 1.2787423292037434,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8774,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 1.1042952212305208,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.7451,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 1.1149949790665101,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7537,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 1.237976361978822,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8782,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 1.3004540665627777,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8582,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 1.1000708833200137,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.7909,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 1.1899967921766514,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8335,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 1.2054207735640328,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8692,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 1.1050589236248065,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7355,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 1.0872751210429186,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.7336,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 1.1511449947711034,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.7859,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.1307539265986024,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.7765,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 1.351090330595727,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.7897,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 1.2975204595562546,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7415,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 1.1163670705147442,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7743,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 1.2755707068662836,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.7988,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.2702567625694303,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8539,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 1.1163092203414966,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7748,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 1.0293590185248311,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.7143,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 1.4409398453864177,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 1.0833,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 1.1706574644576464,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8348,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.135414367069039,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8813,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 1.079296304758334,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8135,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 1.1043937803678643,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8187,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 2.1270436164977844,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.66,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 1.219994157328429,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8427,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 1.2125200585844553,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8415,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 1.1001073708321165,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.6918,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 1.194540673785202,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8302,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 1.1654655298759733,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7777,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 1.0960669927086184,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7228,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 1.3072226786043357,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.745,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 1.0376194288323604,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.7159,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 1.1329808368309582,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.793,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 1.1858370493394028,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.7923,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 1.1937400981515296,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.9004,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.1762178456447314,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.768,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 1.207471247637441,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8273,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 1.169346276818301,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8023,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.9837697261481548,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7058,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 1.1967491234936953,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7353,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 1.059535525619828,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7919,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 1.0669095062974419,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.701,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.9427635478779778,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.5928,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 1.0750208025455008,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7733,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 1.0277897136233543,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7672,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 1.0599838033481415,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7549,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 1.1699655604824146,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8371,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.9071262053759483,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.6607,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 1.0435436605605914,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7598,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.9531556376560218,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.6814,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 1.0526100850960158,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7612,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 1.0884276174032113,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 1.0135538049398993,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7234,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 1.053611711654421,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7816,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 1.0192932237511134,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.6904,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 1.2401048947247897,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8299,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 3.083913619287634,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7016,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 1.1696294945463654,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7838,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 1.1109483737471186,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.731,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 1.059952420722134,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7391,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9655078300817889,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.6989,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 1.0540407412485533,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.797,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 1.0698672980989976,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7769,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 1.112817505418569,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.8024,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 1.072425620863512,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7873,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 1.056488530423442,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.6963,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.9885609012700043,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.6829,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 1.0493733552659685,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.6784,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 1.230162801686097,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.736,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 1.0120333723067219,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.6967,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.9661599057340882,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.6997,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 1.058761827010355,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7332,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 1.1232621172663235,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.6784,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 1.2271609398143184,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8297,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 1.0165493420787164,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7142,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.858774180611481,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.6662,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 1.0431358133949369,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7564,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 1.1210394667165413,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7709,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 1.1097745247659903,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8047,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 1.0081865656866769,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.6768,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.9203647511384035,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.6705,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.994380717363222,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.6535,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.9326117765014191,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.6857,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 1.0065656737007167,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.6634,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 1.260262573325505,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7611,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.1440674766124106,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7246,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 1.1768333911491151,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7046,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 1.1989107764043931,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7431,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 1.0665368519092593,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7049,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 1.0963729026160156,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.737,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 1.1174308562968942,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.821,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 1.1514901189931928,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7987,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 1.1622485278068753,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7098,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 1.0955540600948737,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.6526,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 1.0457876787355673,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7341,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.9877483882289841,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.6878,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 1.1182496551780474,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7395,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 1.0452123422985957,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.6148,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 1.0767028709731223,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7519,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 1.1318399991187422,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7297,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 2.753142555925865,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.6695,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 1.0276528201896862,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.6798,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 1.0831750543575958,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7158,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 1.002228184109368,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7234,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 1.0616413597198417,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7657,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 1.0898849037474774,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7213,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 1.0783972954908356,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7423,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.8991306321972456,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.5727,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 1.0158886111527536,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.6369,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.9976456938631907,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7267,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.105515497623268,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.8043,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.947098123978281,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.6139,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.9067278980666293,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.6185,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.9798295874045432,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.647,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.9494198844289646,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.6033,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.9658328400277845,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.6627,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 1.0409284666705174,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.6509,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 1.0605420636323952,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7419,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.9931102063466744,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6379,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 1.0758646517408224,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7947,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 1.0607905274771352,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7912,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 1.0749429491379305,
+      "learning_rate": 0.0001,
+      "loss": 0.6665,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 1.0274907638555637,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7552,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 1.0269983555780358,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7295,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 1.0172077288402206,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6356,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.9488263777741074,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.6731,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.9280411234896112,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.6565,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 1.0865275056210568,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7646,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.9676804385852837,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.6872,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.9715346008017007,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.6078,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 1.040258403190395,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7776,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 1.3488643904138622,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.962,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 1.0101420289903322,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.6972,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 1.0487242372617993,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.6946,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.9580188922300577,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.6509,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.9729345979989085,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.6879,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.9019650238052146,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.5875,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 1.0697398651008763,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7303,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.9617653135515002,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.6529,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 1.188799963147903,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.743,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 1.1346778862709697,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7093,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.9423687094682542,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.6348,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.972756373128702,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.6293,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.9423401625896834,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.6321,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.8782322342238923,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.5851,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.950614076585,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7006,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 1.0217147617291982,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7017,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 1.0530995450285776,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7241,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.9777035014050306,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.6389,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.9143916182392459,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.6462,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8374075204133037,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.5759,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 1.0167524821100296,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7093,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 1.1410797162196067,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6992,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.9723149428461653,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.699,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.9347966751909983,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.6143,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.9459430624777844,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.5846,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 1.006635068656351,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.6846,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 1.0868346637391972,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7714,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 1.0001561725192702,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.5994,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.9630366831759528,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.6332,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9990161510199028,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.5778,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 1.0270025834274374,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7001,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.9797614392174078,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.552,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.8856936306713705,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.5838,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.9824172956948665,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.6261,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.9484220610341958,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.6504,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.9594657155142843,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7055,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.972240224378151,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7094,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.8488682826787697,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.5163,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 1.0290117142316402,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.6177,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.9570819786081641,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7241,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.9374896149693231,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.6133,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.9213351808810384,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.6202,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.9280034382542609,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.6877,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.860741711039809,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.5419,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.827205240779305,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.532,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.9609370356067223,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.5443,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 1.054834291417378,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6386,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.8127636855002158,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.5077,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 1.2734576666636381,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.6817,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.9137566241882511,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.6076,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.943976185714207,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6374,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.9240301677365691,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.6086,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.9608932693214731,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.5893,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.9109456902151709,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.5603,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.054931654692335,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.6465,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 1.0060225278548545,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.6534,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.9301145221677807,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7283,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.9425878421394381,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.5308,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.968421456129294,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6099,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.9190664904011367,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.664,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.9544868911319989,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.6063,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.9538656246451748,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.5914,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 5.864599498064909,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.5884,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.924638663719702,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.5661,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 1.077927200421442,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7104,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.9126341698953319,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.5792,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.9167560621645063,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6753,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.8934477881360058,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.5331,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 1.0911526373889162,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.5562,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 1.0372617919862839,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.6061,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 1.874872012362247,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6462,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 1.002221190667952,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.5881,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.964781181057697,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.5915,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.9627190399764589,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6914,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 1.017981397065761,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.706,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 1.012164734208218,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6131,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.8897086839324428,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.5776,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 1.0174871061711546,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7699,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.9409135368695652,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.56,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9684884037872186,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.5933,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.8685648466866711,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.5562,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 1.0087704494348668,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.6087,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.8961312204544799,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.6149,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.9365625630552173,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6084,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.9702115890508202,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6264,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 1.1242270439776727,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.697,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.9099287195259446,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.5553,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 1.0095969089859835,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7007,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 1.159962294378398,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.6413,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.86927957380752,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.5238,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.9693712197844089,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6028,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.9843252719096152,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.615,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.8901672647222229,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.5222,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.9651282196979104,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6034,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.8990217271577625,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.5897,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.9311848638173101,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.6126,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.917504640800634,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.588,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.9633897678979783,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.6615,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 1.00795740870038,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.5764,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.8508045166113912,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.5191,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 1.047343602543878,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.6884,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.8558631508335752,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.5093,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.9472568352001958,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.6343,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.9440682841311664,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7094,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.0202250496053065,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.6833,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.9520439938838217,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6359,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 1.094476040858008,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6435,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 1.0048329225203338,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6153,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.9456091943259975,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.6306,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 1.4770809130718134,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.9741,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.974923751267628,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.6106,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.9459621761058625,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.5571,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.8957426369513457,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.5007,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.9364274853309743,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.5761,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.9165602428300078,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.5077,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.9834989066154466,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6162,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 1.1292517278477647,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6428,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.9337817865286436,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.5469,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.9912221780816013,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.5816,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.9106967657243581,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.5833,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 1.07092147689949,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7294,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.9219668633777998,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6053,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 1.7589766769110666,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7696,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.8972215579228963,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.5035,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.9047492331034355,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.5227,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.9475427562480715,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.5762,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.9811638817803725,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.5815,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 1.0020873846705007,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.538,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 1.1067755275823017,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6765,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8715046732367977,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.4914,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 1.0634576890292167,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6705,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.8549848401831835,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.5987,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 1.012508590543694,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.659,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.9693468696989235,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6589,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.9666706891053034,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.6538,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.8992506419411962,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.5815,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.9571595303459484,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.6566,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.9173568643648345,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6204,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 1.0748041207141892,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7146,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.9754095477814513,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.679,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.90539763646193,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.5196,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 1.0640839815841943,
+      "learning_rate": 0.0,
+      "loss": 0.6511,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 41706149216256.0,
+      "train_loss": 0.7100516487008486,
+      "train_runtime": 2587.0165,
+      "train_samples_per_second": 1.933,
+      "train_steps_per_second": 0.121
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 41706149216256.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}